Compare commits

..

2 commits

Author SHA1 Message Date
Agent
db009e3a08 Allow known duplicate test patterns for duplicate service detection tests
Some checks failed
ci/woodpecker/push/ci Pipeline was successful
ci/woodpecker/pr/ci Pipeline was successful
ci/woodpecker/pr/smoke-init Pipeline failed
2026-04-16 13:45:55 +00:00
Agent
2da32d474a fix: Compose generator should detect duplicate service names at generate-time (#850) 2026-04-16 13:45:55 +00:00
77 changed files with 884 additions and 5231 deletions

View file

@ -32,10 +32,13 @@ FORGE_URL=http://localhost:3000 # [CONFIG] local Forgejo instance
# - FORGE_PASS_DEV_QWEN2 # - FORGE_PASS_DEV_QWEN2
# Name conversion: tr 'a-z-' 'A-Z_' (lowercase→UPPER, hyphens→underscores). # Name conversion: tr 'a-z-' 'A-Z_' (lowercase→UPPER, hyphens→underscores).
# The compose generator looks these up via the agent's `forge_user` field in # The compose generator looks these up via the agent's `forge_user` field in
# the project TOML. Configure local-model agents via [agents.X] sections in # the project TOML. The pre-existing `dev-qwen` llama agent uses
# projects/*.toml — this is the canonical activation path. # FORGE_TOKEN_LLAMA / FORGE_PASS_LLAMA (kept for backwards-compat with the
# legacy `ENABLE_LLAMA_AGENT=1` single-agent path).
FORGE_TOKEN= # [SECRET] dev-bot API token (default for all agents) FORGE_TOKEN= # [SECRET] dev-bot API token (default for all agents)
FORGE_PASS= # [SECRET] dev-bot password for git HTTP push (#361) FORGE_PASS= # [SECRET] dev-bot password for git HTTP push (#361)
FORGE_TOKEN_LLAMA= # [SECRET] dev-qwen API token (for agents-llama)
FORGE_PASS_LLAMA= # [SECRET] dev-qwen password for git HTTP push
FORGE_REVIEW_TOKEN= # [SECRET] review-bot API token FORGE_REVIEW_TOKEN= # [SECRET] review-bot API token
FORGE_REVIEW_PASS= # [SECRET] review-bot password for git HTTP push FORGE_REVIEW_PASS= # [SECRET] review-bot password for git HTTP push
FORGE_PLANNER_TOKEN= # [SECRET] planner-bot API token FORGE_PLANNER_TOKEN= # [SECRET] planner-bot API token
@ -104,6 +107,13 @@ FORWARD_AUTH_SECRET= # [SECRET] Shared secret for Caddy ↔
# Store all project secrets here so formulas reference env vars, never hardcode. # Store all project secrets here so formulas reference env vars, never hardcode.
BASE_RPC_URL= # [SECRET] on-chain RPC endpoint BASE_RPC_URL= # [SECRET] on-chain RPC endpoint
# ── Local Qwen dev agent (optional) ──────────────────────────────────────
# Set ENABLE_LLAMA_AGENT=1 to emit agents-llama in docker-compose.yml.
# Requires a running llama-server reachable at ANTHROPIC_BASE_URL.
# See docs/agents-llama.md for details.
ENABLE_LLAMA_AGENT=0 # [CONFIG] 1 = enable agents-llama service
ANTHROPIC_BASE_URL= # [CONFIG] e.g. http://host.docker.internal:8081
# ── Tuning ──────────────────────────────────────────────────────────────── # ── Tuning ────────────────────────────────────────────────────────────────
CLAUDE_TIMEOUT=7200 # [CONFIG] max seconds per Claude invocation CLAUDE_TIMEOUT=7200 # [CONFIG] max seconds per Claude invocation

1
.gitignore vendored
View file

@ -20,6 +20,7 @@ metrics/supervisor-metrics.jsonl
# OS # OS
.DS_Store .DS_Store
dev/ci-fixes-*.json dev/ci-fixes-*.json
gardener/dust.jsonl
# Individual encrypted secrets (managed by disinto secrets add) # Individual encrypted secrets (managed by disinto secrets add)
secrets/ secrets/

View file

@ -294,13 +294,9 @@ def main() -> int:
"9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern", "9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern",
# Standard lib source block shared across formula-driven agent run scripts # Standard lib source block shared across formula-driven agent run scripts
"330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)", "330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)",
# Common vault-seed script patterns: logging helpers + flag parsing # Test files for duplicate service detection (#850)
# Used in tools/vault-seed-woodpecker.sh + lib/init/nomad/wp-oauth-register.sh "334967b8b4f1a8d3b0b9b8e0912f3bfb": "TOML [agents.llama] block in smoke-init.sh and test-duplicate-service-detection.sh",
"843a1cbf987952697d4e05e96ed2b2d5": "Logging helpers + DRY_RUN init (vault-seed-woodpecker + wp-oauth-register)", "d82f30077e5bb23b5fc01db003033d5d": "TOML base_url/model/roles in smoke-init.sh and test-duplicate-service-detection.sh",
"ee51df9642f2ef37af73b0c15f4d8406": "Logging helpers + DRY_RUN loop start (vault-seed-woodpecker + wp-oauth-register)",
"9a57368f3c1dfd29ec328596b86962a0": "Flag parsing loop + case start (vault-seed-woodpecker + wp-oauth-register)",
"9d72d40ff303cbed0b7e628fc15381c3": "Case loop + dry-run handler (vault-seed-woodpecker + wp-oauth-register)",
"5b52ddbbf47948e3cbc1b383f0909588": "Help + invalid arg handler end (vault-seed-woodpecker + wp-oauth-register)",
} }
if not sh_files: if not sh_files:

View file

@ -1,21 +1,16 @@
# ============================================================================= # =============================================================================
# .woodpecker/nomad-validate.yml — Static validation for Nomad+Vault artifacts # .woodpecker/nomad-validate.yml — Static validation for Nomad+Vault artifacts
# #
# Part of the Nomad+Vault migration (S0.5, issue #825; extended in S2.6, # Part of the Nomad+Vault migration (S0.5, issue #825). Locks in the
# issue #884). Locks in the "no-ad-hoc-steps" principle: every HCL/shell # "no-ad-hoc-steps" principle: every HCL/shell artifact under nomad/ or
# artifact under nomad/, lib/init/nomad/, vault/policies/, plus the # lib/init/nomad/, plus the `disinto init` dispatcher, gets checked
# `disinto init` dispatcher and vault/roles.yaml, gets checked before it # before it can land.
# can land.
# #
# Triggers on PRs (and pushes) that touch any of: # Triggers on PRs (and pushes) that touch any of:
# nomad/** — HCL configs (server, client, vault) # nomad/** — HCL configs (server, client, vault)
# lib/init/nomad/** — cluster-up / install / systemd / vault-init / # lib/init/nomad/** — cluster-up / install / systemd / vault-init
# vault-nomad-auth (S2.6 trigger: vault-*.sh
# is a subset of this glob)
# bin/disinto — `disinto init --backend=nomad` dispatcher # bin/disinto — `disinto init --backend=nomad` dispatcher
# tests/disinto-init-nomad.bats — the bats suite itself # tests/disinto-init-nomad.bats — the bats suite itself
# vault/policies/** — Vault ACL policy HCL files (S2.1, S2.6)
# vault/roles.yaml — JWT-auth role bindings (S2.3, S2.6)
# .woodpecker/nomad-validate.yml — the pipeline definition # .woodpecker/nomad-validate.yml — the pipeline definition
# #
# Steps (all fail-closed — any error blocks merge): # Steps (all fail-closed — any error blocks merge):
@ -24,22 +19,8 @@
# nomad/jobs/*.hcl (new jobspecs get # nomad/jobs/*.hcl (new jobspecs get
# CI coverage automatically) # CI coverage automatically)
# 3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl # 3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl
# 4. vault-policy-fmt — `vault policy fmt` idempotence check on # 4. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto
# every vault/policies/*.hcl (format drift = # 5. bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests
# CI fail; non-destructive via cp+diff)
# 5. vault-policy-validate — HCL syntax + capability validation for every
# vault/policies/*.hcl via `vault policy write`
# against an inline dev-mode Vault server
# 6. vault-roles-validate — yamllint + role→policy reference check on
# vault/roles.yaml (every referenced policy
# must exist as vault/policies/<name>.hcl)
# 7. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto
# 8. bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests
#
# Secret-scan coverage: vault/policies/*.hcl is already scanned by the
# P11 gate (.woodpecker/secret-scan.yml, issue #798) — its trigger path
# `vault/**/*` covers everything under this directory. We intentionally
# do NOT duplicate that gate here; one scanner, one source of truth.
# #
# Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 / # Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 /
# vault 1.18.5). Bump there AND here together — drift = CI passing on # vault 1.18.5). Bump there AND here together — drift = CI passing on
@ -53,8 +34,6 @@ when:
- "lib/init/nomad/**" - "lib/init/nomad/**"
- "bin/disinto" - "bin/disinto"
- "tests/disinto-init-nomad.bats" - "tests/disinto-init-nomad.bats"
- "vault/policies/**"
- "vault/roles.yaml"
- ".woodpecker/nomad-validate.yml" - ".woodpecker/nomad-validate.yml"
# Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is # Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is
@ -144,176 +123,7 @@ steps:
*) echo "vault config: hard failure (rc=$rc)" >&2; exit "$rc" ;; *) echo "vault config: hard failure (rc=$rc)" >&2; exit "$rc" ;;
esac esac
# ── 4. Vault policy fmt idempotence check ──────────────────────────────── # ── 4. Shellcheck ────────────────────────────────────────────────────────
# `vault policy fmt <file>` formats a local HCL policy file in place.
# There's no `-check`/dry-run flag (vault 1.18.5), so we implement a
# non-destructive check as cp → fmt-on-copy → diff against original.
# Any diff means the committed file would be rewritten by `vault policy
# fmt` — failure steers the author to run `vault policy fmt <file>`
# locally before pushing.
#
# Scope: vault/policies/*.hcl only. The `[ -f "$f" ]` guard handles the
# no-match case (POSIX sh does not nullglob) so an empty policies/
# directory does not fail this step.
#
# Note: `vault policy fmt` is purely local (HCL text transform) and does
# not require a running Vault server, which is why this step can run
# without starting one.
- name: vault-policy-fmt
image: hashicorp/vault:1.18.5
commands:
- |
set -e
failed=0
for f in vault/policies/*.hcl; do
[ -f "$f" ] || continue
tmp="/tmp/$(basename "$f").fmt"
cp "$f" "$tmp"
vault policy fmt "$tmp" >/dev/null 2>&1
if ! diff -u "$f" "$tmp"; then
echo "ERROR: $f is not formatted — run 'vault policy fmt $f' locally" >&2
failed=1
fi
done
if [ "$failed" -gt 0 ]; then
echo "vault-policy-fmt: formatting drift detected" >&2
exit 1
fi
echo "vault-policy-fmt: all policies formatted correctly"
# ── 5. Vault policy HCL syntax + capability validation ───────────────────
# Vault has no offline `vault policy validate` subcommand — the closest
# in-CLI validator is `vault policy write`, which sends the HCL to a
# running server which parses it, checks capability names against the
# known set (read, list, create, update, delete, patch, sudo, deny),
# and rejects unknown stanzas / malformed path blocks. We start an
# inline dev-mode Vault (in-memory, no persistence, root token = "root")
# for the duration of this step and loop `vault policy write` over every
# vault/policies/*.hcl; the policies never leave the ephemeral dev
# server, so this is strictly a validator — not a deploy.
#
# Exit-code handling:
# - `vault policy write` exits 0 on success, non-zero on any parse /
# semantic error. We aggregate failures across all files so a single
# CI run surfaces every broken policy (not just the first).
# - The dev server is killed on any step exit via EXIT trap so the
# step tears down cleanly even on failure.
#
# Why dev-mode is sufficient: we're not persisting secrets, only asking
# Vault to parse policy text. The factory's production Vault is NOT
# contacted.
- name: vault-policy-validate
image: hashicorp/vault:1.18.5
commands:
- |
set -e
vault server -dev -dev-root-token-id=root -dev-listen-address=127.0.0.1:8200 >/tmp/vault-dev.log 2>&1 &
VAULT_PID=$!
trap 'kill "$VAULT_PID" 2>/dev/null || true' EXIT INT TERM
export VAULT_ADDR=http://127.0.0.1:8200
export VAULT_TOKEN=root
ready=0
i=0
while [ "$i" -lt 30 ]; do
if vault status >/dev/null 2>&1; then
ready=1
break
fi
i=$((i + 1))
sleep 0.5
done
if [ "$ready" -ne 1 ]; then
echo "vault-policy-validate: dev server failed to start after 15s" >&2
cat /tmp/vault-dev.log >&2 || true
exit 1
fi
failed=0
for f in vault/policies/*.hcl; do
[ -f "$f" ] || continue
name=$(basename "$f" .hcl)
echo "validate: $f"
if ! vault policy write "$name" "$f"; then
echo " ERROR: $f failed validation" >&2
failed=1
fi
done
if [ "$failed" -gt 0 ]; then
echo "vault-policy-validate: validation errors found" >&2
exit 1
fi
echo "vault-policy-validate: all policies valid"
# ── 6. vault/roles.yaml validator ────────────────────────────────────────
# Validates the JWT-auth role bindings file (S2.3). Two checks:
#
# a. `yamllint` — catches YAML syntax errors and indentation drift.
# Uses a relaxed config (line length bumped to 200) because
# roles.yaml's comments are wide by design.
# b. role → policy reference check — every role's `policy:` field
# must match a basename in vault/policies/*.hcl. A role pointing
# at a non-existent policy = runtime "permission denied" at job
# placement; catching the drift here turns it into a CI failure.
# Also verifies each role entry has the four required fields
# (name, policy, namespace, job_id) per the file's documented
# format.
#
# Parsing is done with PyYAML (the roles.yaml format is a strict
# subset that awk-level parsing in tools/vault-apply-roles.sh handles
# too, but PyYAML in CI gives us structural validation for free). If
# roles.yaml is ever absent (e.g. reverted), the step skips rather
# than fails — presence is enforced by S2.3's own tooling, not here.
- name: vault-roles-validate
image: python:3.12-alpine
commands:
- pip install --quiet --disable-pip-version-check pyyaml yamllint
- |
set -e
if [ ! -f vault/roles.yaml ]; then
echo "vault-roles-validate: vault/roles.yaml not present, skipping"
exit 0
fi
yamllint -d '{extends: relaxed, rules: {line-length: {max: 200}}}' vault/roles.yaml
echo "vault-roles-validate: yamllint OK"
python3 - <<'PY'
import os
import sys
import yaml
with open('vault/roles.yaml') as f:
data = yaml.safe_load(f) or {}
roles = data.get('roles') or []
if not roles:
print("vault-roles-validate: no roles defined in vault/roles.yaml", file=sys.stderr)
sys.exit(1)
existing = {
os.path.splitext(e)[0]
for e in os.listdir('vault/policies')
if e.endswith('.hcl')
}
required = ('name', 'policy', 'namespace', 'job_id')
failed = 0
for r in roles:
if not isinstance(r, dict):
print(f"ERROR: role entry is not a mapping: {r!r}", file=sys.stderr)
failed = 1
continue
for field in required:
if r.get(field) in (None, ''):
print(f"ERROR: role entry missing required field '{field}': {r}", file=sys.stderr)
failed = 1
policy = r.get('policy')
if policy and policy not in existing:
print(
f"ERROR: role '{r.get('name')}' references policy '{policy}' "
f"but vault/policies/{policy}.hcl does not exist",
file=sys.stderr,
)
failed = 1
sys.exit(failed)
PY
echo "vault-roles-validate: all role→policy references valid"
# ── 7. Shellcheck ────────────────────────────────────────────────────────
# Covers the new lib/init/nomad/*.sh scripts plus bin/disinto (which owns # Covers the new lib/init/nomad/*.sh scripts plus bin/disinto (which owns
# the backend dispatcher). bin/disinto has no .sh extension so the # the backend dispatcher). bin/disinto has no .sh extension so the
# repo-wide shellcheck in .woodpecker/ci.yml skips it — this step is the # repo-wide shellcheck in .woodpecker/ci.yml skips it — this step is the
@ -323,7 +133,7 @@ steps:
commands: commands:
- shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto - shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto
# ── 8. bats: `disinto init --backend=nomad --dry-run` ──────────────────── # ── 5. bats: `disinto init --backend=nomad --dry-run` ────────────────────
# Smoke-tests the CLI dispatcher: both --backend=nomad variants exit 0 # Smoke-tests the CLI dispatcher: both --backend=nomad variants exit 0
# with the expected step list, and --backend=docker stays on the docker # with the expected step list, and --backend=docker stays on the docker
# path (regression guard). Pure dry-run — no sudo, no network. # path (regression guard). Pure dry-run — no sudo, no network.

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a7a046b81a7f454ebec43bab643067bd952d50b0 --> <!-- last-reviewed: 2a7ae0b7eae5979b2c53e3bd1c4280dfdc9df785 -->
# Disinto — Agent Instructions # Disinto — Agent Instructions
## What this repo is ## What this repo is
@ -37,20 +37,17 @@ disinto/ (code repo)
│ examples/ — example vault action TOMLs (promote, publish, release, webhook-call) │ examples/ — example vault action TOMLs (promote, publish, release, webhook-call)
├── lib/ env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, stack-lock.sh, forge-setup.sh, forge-push.sh, ops-setup.sh, ci-setup.sh, generators.sh, hire-agent.sh, release.sh, build-graph.py, branch-protection.sh, secret-scan.sh, tea-helpers.sh, action-vault.sh, ci-log-reader.py, git-creds.sh, sprint-filer.sh, hvault.sh ├── lib/ env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, stack-lock.sh, forge-setup.sh, forge-push.sh, ops-setup.sh, ci-setup.sh, generators.sh, hire-agent.sh, release.sh, build-graph.py, branch-protection.sh, secret-scan.sh, tea-helpers.sh, action-vault.sh, ci-log-reader.py, git-creds.sh, sprint-filer.sh, hvault.sh
│ hooks/ — Claude Code session hooks (on-compact-reinject, on-idle-stop, on-phase-change, on-pretooluse-guard, on-session-end, on-stop-failure) │ hooks/ — Claude Code session hooks (on-compact-reinject, on-idle-stop, on-phase-change, on-pretooluse-guard, on-session-end, on-stop-failure)
│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825); wp-oauth-register.sh (Forgejo OAuth2 app + Vault KV seeder for Woodpecker, S3.3) │ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825)
├── nomad/ server.hcl, client.hcl, vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh ├── nomad/ server.hcl, client.hcl, vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh
│ jobs/ — Nomad jobspecs: forgejo.hcl (Vault secrets via template, S2.4); woodpecker-server.hcl + woodpecker-agent.hcl (host-net, docker.sock, Vault KV, S3.1-S3.2)
├── projects/ *.toml.example — templates; *.toml — local per-box config (gitignored) ├── projects/ *.toml.example — templates; *.toml — local per-box config (gitignored)
├── formulas/ Issue templates (TOML specs for multi-step agent tasks) ├── formulas/ Issue templates (TOML specs for multi-step agent tasks)
├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge dispatcher, chat (server.py, entrypoint-chat.sh, Dockerfile, ui/) ├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge dispatcher, chat (server.py, entrypoint-chat.sh, Dockerfile, ui/)
├── tools/ Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh) ├── tools/ Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh)
│ vault-apply-policies.sh, vault-apply-roles.sh, vault-import.sh — Vault provisioning (S2.1/S2.2)
│ vault-seed-<svc>.sh — per-service Vault secret seeders; auto-invoked by `bin/disinto --with <svc>` (add a new file to support a new service)
├── docs/ Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md) ├── docs/ Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md)
├── site/ disinto.ai website content ├── site/ disinto.ai website content
├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, lib-generators.bats, vault-import.bats, disinto-init-nomad.bats) ├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, disinto-init-nomad.bats)
├── templates/ Issue templates ├── templates/ Issue templates
├── bin/ The `disinto` CLI script (`--with <svc>` deploys services + runs their Vault seeders) ├── bin/ The `disinto` CLI script
├── disinto-factory/ Setup documentation and skill ├── disinto-factory/ Setup documentation and skill
├── state/ Runtime state ├── state/ Runtime state
├── .woodpecker/ Woodpecker CI pipeline configs ├── .woodpecker/ Woodpecker CI pipeline configs
@ -123,7 +120,8 @@ bash dev/phase-test.sh
| Reproduce | `docker/reproduce/` | Bug reproduction using Playwright MCP | `formulas/reproduce.toml` | | Reproduce | `docker/reproduce/` | Bug reproduction using Playwright MCP | `formulas/reproduce.toml` |
| Triage | `docker/reproduce/` | Deep root cause analysis | `formulas/triage.toml` | | Triage | `docker/reproduce/` | Deep root cause analysis | `formulas/triage.toml` |
| Edge dispatcher | `docker/edge/` | Polls ops repo for vault actions, executes via Claude sessions | `docker/edge/dispatcher.sh` | | Edge dispatcher | `docker/edge/` | Polls ops repo for vault actions, executes via Claude sessions | `docker/edge/dispatcher.sh` |
| Local-model agents | `docker/agents/` (same image) | Local llama-server agents configured via `[agents.X]` sections in project TOML | [docs/agents-llama.md](docs/agents-llama.md) | | agents-llama | `docker/agents/` (same image) | Local-Qwen dev agent (`AGENT_ROLES=dev`), gated on `ENABLE_LLAMA_AGENT=1` | [docs/agents-llama.md](docs/agents-llama.md) |
| agents-llama-all | `docker/agents/` (same image) | Local-Qwen all-roles agent (all 7 roles), profile `agents-llama-all` | [docs/agents-llama.md](docs/agents-llama.md) |
> **Vault:** Being redesigned as a PR-based approval workflow (issues #73-#77). > **Vault:** Being redesigned as a PR-based approval workflow (issues #73-#77).
> See [docs/VAULT.md](docs/VAULT.md) for the vault PR workflow details. > See [docs/VAULT.md](docs/VAULT.md) for the vault PR workflow details.
@ -194,7 +192,9 @@ Humans write these. Agents read and enforce them.
## Phase-Signaling Protocol ## Phase-Signaling Protocol
When running as a persistent tmux session, Claude must signal the orchestrator at each phase boundary by writing to a phase file (e.g. `/tmp/dev-session-{project}-{issue}.phase`). When running as a persistent tmux session, Claude must signal the orchestrator
at each phase boundary by writing to a phase file (e.g.
`/tmp/dev-session-{project}-{issue}.phase`).
Key phases: `PHASE:awaiting_ci``PHASE:awaiting_review``PHASE:done`. Also: `PHASE:escalate` (needs human input), `PHASE:failed`. Key phases: `PHASE:awaiting_ci``PHASE:awaiting_review``PHASE:done`. Also: `PHASE:escalate` (needs human input), `PHASE:failed`.
See [docs/PHASE-PROTOCOL.md](docs/PHASE-PROTOCOL.md) for the complete spec, orchestrator reaction matrix, sequence diagram, and crash recovery. See [docs/PHASE-PROTOCOL.md](docs/PHASE-PROTOCOL.md) for the complete spec, orchestrator reaction matrix, sequence diagram, and crash recovery.

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a7a046b81a7f454ebec43bab643067bd952d50b0 --> <!-- last-reviewed: 2a7ae0b7eae5979b2c53e3bd1c4280dfdc9df785 -->
# Architect — Agent Instructions # Architect — Agent Instructions
## What this agent is ## What this agent is

View file

@ -82,16 +82,13 @@ Init options:
--ci-id <n> Woodpecker CI repo ID (default: 0 = no CI) --ci-id <n> Woodpecker CI repo ID (default: 0 = no CI)
--forge-url <url> Forge base URL (default: http://localhost:3000) --forge-url <url> Forge base URL (default: http://localhost:3000)
--backend <value> Orchestration backend: docker (default) | nomad --backend <value> Orchestration backend: docker (default) | nomad
--with <services> (nomad) Deploy services: forgejo,woodpecker[,...] (S1.3, S3.4) --with <services> (nomad) Deploy services: forgejo[,...] (S1.3)
--empty (nomad) Bring up cluster only, no jobs (S0.4) --empty (nomad) Bring up cluster only, no jobs (S0.4)
--bare Skip compose generation (bare-metal setup) --bare Skip compose generation (bare-metal setup)
--build Use local docker build instead of registry images (dev mode) --build Use local docker build instead of registry images (dev mode)
--yes Skip confirmation prompts --yes Skip confirmation prompts
--rotate-tokens Force regeneration of all bot tokens/passwords (idempotent by default) --rotate-tokens Force regeneration of all bot tokens/passwords (idempotent by default)
--dry-run Print every intended action without executing --dry-run Print every intended action without executing
--import-env <path> (nomad) Path to .env file for import into Vault KV (S2.5)
--import-sops <path> (nomad) Path to sops-encrypted .env.vault.enc for import (S2.5)
--age-key <path> (nomad) Path to age keyfile (required with --import-sops) (S2.5)
Hire an agent options: Hire an agent options:
--formula <path> Path to role formula TOML (default: formulas/<role>.toml) --formula <path> Path to role formula TOML (default: formulas/<role>.toml)
@ -667,13 +664,8 @@ prompt_admin_password() {
# `sudo disinto init ...` directly. # `sudo disinto init ...` directly.
_disinto_init_nomad() { _disinto_init_nomad() {
local dry_run="${1:-false}" empty="${2:-false}" with_services="${3:-}" local dry_run="${1:-false}" empty="${2:-false}" with_services="${3:-}"
local import_env="${4:-}" import_sops="${5:-}" age_key="${6:-}"
local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh" local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh"
local deploy_sh="${FACTORY_ROOT}/lib/init/nomad/deploy.sh" local deploy_sh="${FACTORY_ROOT}/lib/init/nomad/deploy.sh"
local vault_engines_sh="${FACTORY_ROOT}/lib/init/nomad/vault-engines.sh"
local vault_policies_sh="${FACTORY_ROOT}/tools/vault-apply-policies.sh"
local vault_auth_sh="${FACTORY_ROOT}/lib/init/nomad/vault-nomad-auth.sh"
local vault_import_sh="${FACTORY_ROOT}/tools/vault-import.sh"
if [ ! -x "$cluster_up" ]; then if [ ! -x "$cluster_up" ]; then
echo "Error: ${cluster_up} not found or not executable" >&2 echo "Error: ${cluster_up} not found or not executable" >&2
@ -685,42 +677,6 @@ _disinto_init_nomad() {
exit 1 exit 1
fi fi
# --empty short-circuits after cluster-up: no policies, no auth, no
# import, no deploy. It's the "cluster-only escape hatch" for debugging
# (docs/nomad-migration.md). Caller-side validation already rejects
# --empty combined with --with or any --import-* flag, so reaching
# this branch with those set is a bug in the caller.
#
# On the default (non-empty) path, vault-engines.sh (enables the kv/
# mount), vault-apply-policies.sh, and vault-nomad-auth.sh are invoked
# unconditionally — they are idempotent and cheap to re-run, and
# subsequent --with deployments depend on them. vault-import.sh is
# invoked only when an --import-* flag is set. vault-engines.sh runs
# first because every policy and role below references kv/disinto/*
# paths, which 403 if the engine is not yet mounted (issue #912).
local import_any=false
if [ -n "$import_env" ] || [ -n "$import_sops" ]; then
import_any=true
fi
if [ "$empty" != "true" ]; then
if [ ! -x "$vault_engines_sh" ]; then
echo "Error: ${vault_engines_sh} not found or not executable" >&2
exit 1
fi
if [ ! -x "$vault_policies_sh" ]; then
echo "Error: ${vault_policies_sh} not found or not executable" >&2
exit 1
fi
if [ ! -x "$vault_auth_sh" ]; then
echo "Error: ${vault_auth_sh} not found or not executable" >&2
exit 1
fi
if [ "$import_any" = true ] && [ ! -x "$vault_import_sh" ]; then
echo "Error: ${vault_import_sh} not found or not executable" >&2
exit 1
fi
fi
# --empty and default both invoke cluster-up today. Log the requested # --empty and default both invoke cluster-up today. Log the requested
# mode so the dispatch is visible in factory bootstrap logs — Step 1 # mode so the dispatch is visible in factory bootstrap logs — Step 1
# will branch on $empty to gate the job-deployment path. # will branch on $empty to gate the job-deployment path.
@ -730,7 +686,7 @@ _disinto_init_nomad() {
echo "nomad backend: default (cluster-up; jobs deferred to Step 1)" echo "nomad backend: default (cluster-up; jobs deferred to Step 1)"
fi fi
# Dry-run: print cluster-up plan + policies/auth/import plan + deploy.sh plan # Dry-run: print cluster-up plan + deploy.sh plan
if [ "$dry_run" = "true" ]; then if [ "$dry_run" = "true" ]; then
echo "" echo ""
echo "── Cluster-up dry-run ─────────────────────────────────" echo "── Cluster-up dry-run ─────────────────────────────────"
@ -738,94 +694,20 @@ _disinto_init_nomad() {
"${cmd[@]}" || true "${cmd[@]}" || true
echo "" echo ""
# --empty skips policies/auth/import/deploy — cluster-up only, no
# workloads. The operator-visible dry-run plan must match the real
# run, so short-circuit here too.
if [ "$empty" = "true" ]; then
exit 0
fi
# Vault engines + policies + auth are invoked on every nomad real-run
# path regardless of --import-* flags (they're idempotent; S2.1 + S2.3).
# Engines runs first because policies/roles/templates all reference the
# kv/ mount it enables (issue #912). Mirror that ordering in the
# dry-run plan so the operator sees the full sequence Step 2 will
# execute.
echo "── Vault engines dry-run ──────────────────────────────"
echo "[engines] [dry-run] ${vault_engines_sh} --dry-run"
echo ""
echo "── Vault policies dry-run ─────────────────────────────"
echo "[policies] [dry-run] ${vault_policies_sh} --dry-run"
echo ""
echo "── Vault auth dry-run ─────────────────────────────────"
echo "[auth] [dry-run] ${vault_auth_sh}"
echo ""
# Import plan: one line per --import-* flag that is actually set.
# Printing independently (not in an if/elif chain) means that all
# three flags appearing together each echo their own path — the
# regression that bit prior implementations of this issue (#883).
if [ "$import_any" = true ]; then
echo "── Vault import dry-run ───────────────────────────────"
[ -n "$import_env" ] && echo "[import] --import-env env file: ${import_env}"
[ -n "$import_sops" ] && echo "[import] --import-sops sops file: ${import_sops}"
[ -n "$age_key" ] && echo "[import] --age-key age key: ${age_key}"
local -a import_dry_cmd=("$vault_import_sh")
[ -n "$import_env" ] && import_dry_cmd+=("--env" "$import_env")
[ -n "$import_sops" ] && import_dry_cmd+=("--sops" "$import_sops")
[ -n "$age_key" ] && import_dry_cmd+=("--age-key" "$age_key")
import_dry_cmd+=("--dry-run")
echo "[import] [dry-run] ${import_dry_cmd[*]}"
echo ""
else
echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services"
echo ""
fi
if [ -n "$with_services" ]; then if [ -n "$with_services" ]; then
# Vault seed plan (S2.6, #928): one line per service whose echo "── Deploy services dry-run ────────────────────────────"
# tools/vault-seed-<svc>.sh ships. Sub-services (woodpecker-server, echo "[deploy] services to deploy: ${with_services}"
# woodpecker-agent) map to their parent seeder (vault-seed-woodpecker.sh).
# Deduplicated so the seeder runs once even when both sub-services
# are present.
local seed_hdr_printed=false
local _seed_seen=""
local IFS=',' local IFS=','
for svc in $with_services; do for svc in $with_services; do
svc=$(echo "$svc" | xargs) # trim whitespace svc=$(echo "$svc" | xargs) # trim whitespace
# Map sub-services to parent seed name # Validate known services first
local seed_name="$svc"
case "$svc" in case "$svc" in
woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; forgejo) ;;
*)
echo "Error: unknown service '${svc}' — known: forgejo" >&2
exit 1
;;
esac esac
# Deduplicate
if echo ",$_seed_seen," | grep -q ",$seed_name,"; then continue; fi
_seed_seen="${_seed_seen:+${_seed_seen},}${seed_name}"
local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh"
if [ -x "$seed_script" ]; then
if [ "$seed_hdr_printed" = false ]; then
echo "── Vault seed dry-run ─────────────────────────────────"
seed_hdr_printed=true
fi
echo "[seed] [dry-run] ${seed_script} --dry-run"
fi
done
[ "$seed_hdr_printed" = true ] && echo ""
echo "── Deploy services dry-run ────────────────────────────"
echo "[deploy] services to deploy: ${with_services}"
# Build ordered deploy list: only include services present in with_services
local DEPLOY_ORDER=""
for ordered_svc in forgejo woodpecker-server woodpecker-agent; do
if echo ",$with_services," | grep -q ",$ordered_svc,"; then
DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}"
fi
done
echo "[deploy] deployment order: ${DEPLOY_ORDER}"
local IFS=' '
for svc in $DEPLOY_ORDER; do
local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl"
if [ ! -f "$jobspec_path" ]; then if [ ! -f "$jobspec_path" ]; then
echo "Error: jobspec not found: ${jobspec_path}" >&2 echo "Error: jobspec not found: ${jobspec_path}" >&2
@ -839,7 +721,7 @@ _disinto_init_nomad() {
exit 0 exit 0
fi fi
# Real run: cluster-up + policies + auth + (optional) import + deploy # Real run: cluster-up + deploy services
local -a cluster_cmd=("$cluster_up") local -a cluster_cmd=("$cluster_up")
if [ "$(id -u)" -eq 0 ]; then if [ "$(id -u)" -eq 0 ]; then
"${cluster_cmd[@]}" || exit $? "${cluster_cmd[@]}" || exit $?
@ -851,147 +733,27 @@ _disinto_init_nomad() {
sudo -n -- "${cluster_cmd[@]}" || exit $? sudo -n -- "${cluster_cmd[@]}" || exit $?
fi fi
# --empty short-circuits here: cluster-up only, no policies/auth/import
# and no deploy. Matches the dry-run plan above and the docs/runbook.
if [ "$empty" = "true" ]; then
exit 0
fi
# Enable Vault secret engines (S2.1 / issue #912) — must precede
# policies/auth/import because every policy and every import target
# addresses paths under kv/. Idempotent, safe to re-run.
echo ""
echo "── Enabling Vault secret engines ──────────────────────"
local -a engines_cmd=("$vault_engines_sh")
if [ "$(id -u)" -eq 0 ]; then
"${engines_cmd[@]}" || exit $?
else
if ! command -v sudo >/dev/null 2>&1; then
echo "Error: vault-engines.sh must run as root and sudo is not installed" >&2
exit 1
fi
sudo -n -- "${engines_cmd[@]}" || exit $?
fi
# Apply Vault policies (S2.1) — idempotent, safe to re-run.
echo ""
echo "── Applying Vault policies ────────────────────────────"
local -a policies_cmd=("$vault_policies_sh")
if [ "$(id -u)" -eq 0 ]; then
"${policies_cmd[@]}" || exit $?
else
if ! command -v sudo >/dev/null 2>&1; then
echo "Error: vault-apply-policies.sh must run as root and sudo is not installed" >&2
exit 1
fi
sudo -n -- "${policies_cmd[@]}" || exit $?
fi
# Configure Vault JWT auth + Nomad workload identity (S2.3) — idempotent.
echo ""
echo "── Configuring Vault JWT auth ─────────────────────────"
local -a auth_cmd=("$vault_auth_sh")
if [ "$(id -u)" -eq 0 ]; then
"${auth_cmd[@]}" || exit $?
else
if ! command -v sudo >/dev/null 2>&1; then
echo "Error: vault-nomad-auth.sh must run as root and sudo is not installed" >&2
exit 1
fi
sudo -n -- "${auth_cmd[@]}" || exit $?
fi
# Import secrets if any --import-* flag is set (S2.2).
if [ "$import_any" = true ]; then
echo ""
echo "── Importing secrets into Vault ───────────────────────"
local -a import_cmd=("$vault_import_sh")
[ -n "$import_env" ] && import_cmd+=("--env" "$import_env")
[ -n "$import_sops" ] && import_cmd+=("--sops" "$import_sops")
[ -n "$age_key" ] && import_cmd+=("--age-key" "$age_key")
if [ "$(id -u)" -eq 0 ]; then
"${import_cmd[@]}" || exit $?
else
if ! command -v sudo >/dev/null 2>&1; then
echo "Error: vault-import.sh must run as root and sudo is not installed" >&2
exit 1
fi
sudo -n -- "${import_cmd[@]}" || exit $?
fi
else
echo ""
echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services"
fi
# Seed Vault for services that ship their own seeder (S2.6, #928).
# Convention: tools/vault-seed-<svc>.sh — auto-invoked when --with <svc>
# is requested. Runs AFTER vault-import so that real imported values
# win over generated seeds when both are present; each seeder is
# idempotent on a per-key basis (see vault-seed-forgejo.sh's
# "missing → generate, present → unchanged" contract), so re-running
# init does not rotate existing keys. Services without a seeder are
# silently skipped — keeps this loop forward-compatible with Step 3+
# services that may ship their own seeder without touching bin/disinto.
#
# VAULT_ADDR is passed explicitly because cluster-up.sh writes the
# profile.d export *during* this same init run, so the current shell
# hasn't sourced it yet; sibling vault-* scripts (engines/policies/
# auth/import) default VAULT_ADDR internally via _hvault_default_env,
# but vault-seed-forgejo.sh requires the caller to set it.
#
# The non-root branch invokes the seeder as `sudo -n -- env VAR=val
# script` rather than `sudo -n VAR=val -- script`: sudo treats bare
# `VAR=val` args as sudoers env-assignments, which the default
# `env_reset=on` policy silently discards unless the variable is in
# `env_keep` (VAULT_ADDR is not). Using `env` as the actual command
# sets VAULT_ADDR in the child process regardless of sudoers policy.
if [ -n "$with_services" ]; then
local vault_addr="${VAULT_ADDR:-http://127.0.0.1:8200}"
local _seed_seen=""
local IFS=','
for svc in $with_services; do
svc=$(echo "$svc" | xargs) # trim whitespace
# Map sub-services to parent seed name (S3.4)
local seed_name="$svc"
case "$svc" in
woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;;
esac
# Deduplicate
if echo ",$_seed_seen," | grep -q ",$seed_name,"; then continue; fi
_seed_seen="${_seed_seen:+${_seed_seen},}${seed_name}"
local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh"
if [ -x "$seed_script" ]; then
echo ""
echo "── Seeding Vault for ${seed_name} ───────────────────────────"
if [ "$(id -u)" -eq 0 ]; then
VAULT_ADDR="$vault_addr" "$seed_script" || exit $?
else
if ! command -v sudo >/dev/null 2>&1; then
echo "Error: vault-seed-${seed_name}.sh must run as root and sudo is not installed" >&2
exit 1
fi
sudo -n -- env "VAULT_ADDR=$vault_addr" "$seed_script" || exit $?
fi
fi
done
fi
# Deploy services if requested # Deploy services if requested
if [ -n "$with_services" ]; then if [ -n "$with_services" ]; then
echo "" echo ""
echo "── Deploying services ─────────────────────────────────" echo "── Deploying services ─────────────────────────────────"
# Build ordered deploy list (S3.4): forgejo → woodpecker-server → woodpecker-agent
local DEPLOY_ORDER=""
for ordered_svc in forgejo woodpecker-server woodpecker-agent; do
if echo ",$with_services," | grep -q ",$ordered_svc,"; then
DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}"
fi
done
local -a deploy_cmd=("$deploy_sh") local -a deploy_cmd=("$deploy_sh")
local IFS=' ' # Split comma-separated service list into positional args
for svc in $DEPLOY_ORDER; do local IFS=','
for svc in $with_services; do
svc=$(echo "$svc" | xargs) # trim whitespace
if ! echo "$svc" | grep -qE '^[a-zA-Z0-9_-]+$'; then
echo "Error: invalid service name '${svc}' — must match ^[a-zA-Z0-9_-]+$" >&2
exit 1
fi
# Validate known services FIRST (before jobspec check)
case "$svc" in
forgejo) ;;
*)
echo "Error: unknown service '${svc}' — known: forgejo" >&2
exit 1
;;
esac
# Check jobspec exists # Check jobspec exists
local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl"
if [ ! -f "$jobspec_path" ]; then if [ ! -f "$jobspec_path" ]; then
@ -1015,26 +777,10 @@ _disinto_init_nomad() {
echo "" echo ""
echo "── Summary ────────────────────────────────────────────" echo "── Summary ────────────────────────────────────────────"
echo "Cluster: Nomad+Vault cluster is up" echo "Cluster: Nomad+Vault cluster is up"
echo "Policies: applied (Vault ACL)"
echo "Auth: Vault JWT auth + Nomad workload identity configured"
if [ "$import_any" = true ]; then
local import_desc=""
[ -n "$import_env" ] && import_desc+="${import_env} "
[ -n "$import_sops" ] && import_desc+="${import_sops} "
echo "Imported: ${import_desc% }"
else
echo "Imported: (none — seed kv/disinto/* manually before deploying secret-dependent services)"
fi
echo "Deployed: ${with_services}" echo "Deployed: ${with_services}"
if echo ",$with_services," | grep -q ",forgejo,"; then if echo "$with_services" | grep -q "forgejo"; then
echo "Ports: forgejo: 3000" echo "Ports: forgejo: 3000"
fi fi
if echo ",$with_services," | grep -q ",woodpecker-server,"; then
echo " woodpecker-server: 8000"
fi
if echo ",$with_services," | grep -q ",woodpecker-agent,"; then
echo " woodpecker-agent: (agent connected)"
fi
echo "────────────────────────────────────────────────────────" echo "────────────────────────────────────────────────────────"
fi fi
@ -1057,7 +803,6 @@ disinto_init() {
# Parse flags # Parse flags
local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false with_services="" local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false with_services=""
local import_env="" import_sops="" age_key=""
while [ $# -gt 0 ]; do while [ $# -gt 0 ]; do
case "$1" in case "$1" in
--branch) branch="$2"; shift 2 ;; --branch) branch="$2"; shift 2 ;;
@ -1074,12 +819,6 @@ disinto_init() {
--yes) auto_yes=true; shift ;; --yes) auto_yes=true; shift ;;
--rotate-tokens) rotate_tokens=true; shift ;; --rotate-tokens) rotate_tokens=true; shift ;;
--dry-run) dry_run=true; shift ;; --dry-run) dry_run=true; shift ;;
--import-env) import_env="$2"; shift 2 ;;
--import-env=*) import_env="${1#--import-env=}"; shift ;;
--import-sops) import_sops="$2"; shift 2 ;;
--import-sops=*) import_sops="${1#--import-sops=}"; shift ;;
--age-key) age_key="$2"; shift 2 ;;
--age-key=*) age_key="${1#--age-key=}"; shift ;;
*) echo "Unknown option: $1" >&2; exit 1 ;; *) echo "Unknown option: $1" >&2; exit 1 ;;
esac esac
done done
@ -1120,80 +859,11 @@ disinto_init() {
exit 1 exit 1
fi fi
# Normalize --with services (S3.4): expand 'woodpecker' shorthand to
# 'woodpecker-server,woodpecker-agent', auto-include forgejo when
# woodpecker is requested (OAuth dependency), and validate all names.
if [ -n "$with_services" ]; then
# Expand 'woodpecker' (bare) → 'woodpecker-server,woodpecker-agent'.
# Must not match already-expanded 'woodpecker-server'/'woodpecker-agent'.
local expanded=""
local IFS=','
for _svc in $with_services; do
_svc=$(echo "$_svc" | xargs)
case "$_svc" in
woodpecker) _svc="woodpecker-server,woodpecker-agent" ;;
esac
expanded="${expanded:+${expanded},}${_svc}"
done
with_services="$expanded"
unset IFS
# Auto-include forgejo when woodpecker is requested
if echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent," \
&& ! echo ",$with_services," | grep -q ",forgejo,"; then
echo "Note: --with woodpecker implies --with forgejo (OAuth dependency)"
with_services="forgejo,${with_services}"
fi
# Validate all service names are known
local IFS=','
for _svc in $with_services; do
_svc=$(echo "$_svc" | xargs)
case "$_svc" in
forgejo|woodpecker-server|woodpecker-agent) ;;
*)
echo "Error: unknown service '${_svc}' — known: forgejo, woodpecker-server, woodpecker-agent" >&2
exit 1
;;
esac
done
unset IFS
fi
# --import-* flag validation (S2.5). These three flags form an import
# triple and must be consistent before dispatch: sops encryption is
# useless without the age key to decrypt it, so either both --import-sops
# and --age-key are present or neither is. --import-env alone is fine
# (it just imports the plaintext dotenv). All three flags are nomad-only.
if [ -n "$import_sops" ] && [ -z "$age_key" ]; then
echo "Error: --import-sops requires --age-key" >&2
exit 1
fi
if [ -n "$age_key" ] && [ -z "$import_sops" ]; then
echo "Error: --age-key requires --import-sops" >&2
exit 1
fi
if { [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; } \
&& [ "$backend" != "nomad" ]; then
echo "Error: --import-env, --import-sops, and --age-key require --backend=nomad" >&2
exit 1
fi
# --empty is the cluster-only escape hatch — it skips policies, auth,
# import, and deploy. Pairing it with --import-* silently does nothing,
# which is a worse failure mode than a clear error. Reject explicitly.
if [ "$empty" = true ] \
&& { [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; }; then
echo "Error: --empty and --import-env/--import-sops/--age-key are mutually exclusive" >&2
exit 1
fi
# Dispatch on backend — the nomad path runs lib/init/nomad/cluster-up.sh # Dispatch on backend — the nomad path runs lib/init/nomad/cluster-up.sh
# (S0.4). The default and --empty variants are identical today; Step 1 # (S0.4). The default and --empty variants are identical today; Step 1
# will branch on $empty to add job deployment to the default path. # will branch on $empty to add job deployment to the default path.
if [ "$backend" = "nomad" ]; then if [ "$backend" = "nomad" ]; then
_disinto_init_nomad "$dry_run" "$empty" "$with_services" \ _disinto_init_nomad "$dry_run" "$empty" "$with_services"
"$import_env" "$import_sops" "$age_key"
# shellcheck disable=SC2317 # _disinto_init_nomad always exits today; # shellcheck disable=SC2317 # _disinto_init_nomad always exits today;
# `return` is defensive against future refactors. # `return` is defensive against future refactors.
return return
@ -1307,6 +977,7 @@ p.write_text(text)
echo "" echo ""
echo "[ensure] Forgejo admin user 'disinto-admin'" echo "[ensure] Forgejo admin user 'disinto-admin'"
echo "[ensure] 8 bot users: dev-bot, review-bot, planner-bot, gardener-bot, vault-bot, supervisor-bot, predictor-bot, architect-bot" echo "[ensure] 8 bot users: dev-bot, review-bot, planner-bot, gardener-bot, vault-bot, supervisor-bot, predictor-bot, architect-bot"
echo "[ensure] 2 llama bot users: dev-qwen, dev-qwen-nightly"
echo "[ensure] .profile repos for all bots" echo "[ensure] .profile repos for all bots"
echo "[ensure] repo ${forge_repo} on Forgejo with collaborators" echo "[ensure] repo ${forge_repo} on Forgejo with collaborators"
echo "[run] preflight checks" echo "[run] preflight checks"
@ -1502,6 +1173,19 @@ p.write_text(text)
echo "Config: CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1 saved to .env" echo "Config: CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1 saved to .env"
fi fi
# Write local-Qwen dev agent env keys with safe defaults (#769)
if ! grep -q '^ENABLE_LLAMA_AGENT=' "$env_file" 2>/dev/null; then
cat >> "$env_file" <<'LLAMAENVEOF'
# Local Qwen dev agent (optional) — set to 1 to enable
ENABLE_LLAMA_AGENT=0
FORGE_TOKEN_LLAMA=
FORGE_PASS_LLAMA=
ANTHROPIC_BASE_URL=
LLAMAENVEOF
echo "Config: ENABLE_LLAMA_AGENT keys written to .env (disabled by default)"
fi
# Create labels on remote # Create labels on remote
create_labels "$forge_repo" "$forge_url" create_labels "$forge_repo" "$forge_url"

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a7a046b81a7f454ebec43bab643067bd952d50b0 --> <!-- last-reviewed: 2a7ae0b7eae5979b2c53e3bd1c4280dfdc9df785 -->
# Dev Agent # Dev Agent
**Role**: Implement issues autonomously — write code, push branches, address **Role**: Implement issues autonomously — write code, push branches, address

View file

@ -2,7 +2,7 @@ FROM debian:bookworm-slim
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
bash curl git jq tmux python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ bash curl git jq tmux python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \
&& pip3 install --break-system-packages networkx tomlkit \ && pip3 install --break-system-packages networkx \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Pre-built binaries (copied from docker/agents/bin/) # Pre-built binaries (copied from docker/agents/bin/)

View file

@ -17,38 +17,6 @@ set -euo pipefail
# - predictor: every 24 hours (288 iterations * 5 min) # - predictor: every 24 hours (288 iterations * 5 min)
# - supervisor: every SUPERVISOR_INTERVAL seconds (default: 1200 = 20 min) # - supervisor: every SUPERVISOR_INTERVAL seconds (default: 1200 = 20 min)
# ── Migration check: reject ENABLE_LLAMA_AGENT ───────────────────────────────
# #846: The legacy ENABLE_LLAMA_AGENT env flag is no longer supported.
# Activation is now done exclusively via [agents.X] sections in project TOML.
# If this legacy flag is detected, fail immediately with a migration message.
if [ "${ENABLE_LLAMA_AGENT:-}" = "1" ]; then
cat <<'MIGRATION_ERR'
FATAL: ENABLE_LLAMA_AGENT is no longer supported.
The legacy ENABLE_LLAMA_AGENT=1 flag has been removed (#846).
Activation is now done exclusively via [agents.X] sections in projects/*.toml.
To migrate:
1. Remove ENABLE_LLAMA_AGENT from your .env or .env.enc file
2. Add an [agents.<name>] section to your project TOML:
[agents.dev-qwen]
base_url = "http://your-llama-server:8081"
model = "unsloth/Qwen3.5-35B-A3B"
api_key = "sk-no-key-required"
roles = ["dev"]
forge_user = "dev-qwen"
compact_pct = 60
poll_interval = 60
3. Run: disinto init
4. Start the agent: docker compose up -d agents-dev-qwen
See docs/agents-llama.md for full details.
MIGRATION_ERR
exit 1
fi
DISINTO_BAKED="/home/agent/disinto" DISINTO_BAKED="/home/agent/disinto"
DISINTO_LIVE="/home/agent/repos/_factory" DISINTO_LIVE="/home/agent/repos/_factory"
DISINTO_DIR="$DISINTO_BAKED" # start with baked copy; switched to live checkout after bootstrap DISINTO_DIR="$DISINTO_BAKED" # start with baked copy; switched to live checkout after bootstrap
@ -374,32 +342,9 @@ bootstrap_ops_repos
# Bootstrap factory repo — switch DISINTO_DIR to live checkout (#593) # Bootstrap factory repo — switch DISINTO_DIR to live checkout (#593)
bootstrap_factory_repo bootstrap_factory_repo
# Validate that projects directory has at least one real .toml file (not .example)
# This prevents the silent-zombie mode where the polling loop matches zero files
# and does nothing forever.
validate_projects_dir() {
# NOTE: compgen -G exits non-zero when no matches exist, so piping it through
# `wc -l` under `set -eo pipefail` aborts the script before the FATAL branch
# can log a diagnostic (#877). Use the conditional form already adopted at
# lines above (see bootstrap_factory_repo, PROJECT_NAME parsing).
if ! compgen -G "${DISINTO_DIR}/projects/*.toml" >/dev/null 2>&1; then
log "FATAL: No real .toml files found in ${DISINTO_DIR}/projects/"
log "Expected at least one project config file (e.g., disinto.toml)"
log "The directory only contains *.toml.example template files."
log "Mount the host ./projects volume or copy real .toml files into the container."
exit 1
fi
local toml_count
toml_count=$(compgen -G "${DISINTO_DIR}/projects/*.toml" | wc -l)
log "Projects directory validated: ${toml_count} real .toml file(s) found"
}
# Initialize state directory for check_active guards # Initialize state directory for check_active guards
init_state_dir init_state_dir
# Validate projects directory before entering polling loop
validate_projects_dir
# Parse AGENT_ROLES env var (default: all agents) # Parse AGENT_ROLES env var (default: all agents)
# Expected format: comma-separated list like "review,dev,gardener" # Expected format: comma-separated list like "review,dev,gardener"
AGENT_ROLES="${AGENT_ROLES:-review,dev,gardener,architect,planner,predictor,supervisor}" AGENT_ROLES="${AGENT_ROLES:-review,dev,gardener,architect,planner,predictor,supervisor}"

View file

@ -2,12 +2,9 @@
Local-model agents run the same agent code as the Claude-backed agents, but Local-model agents run the same agent code as the Claude-backed agents, but
connect to a local llama-server (or compatible OpenAI-API endpoint) instead of connect to a local llama-server (or compatible OpenAI-API endpoint) instead of
the Anthropic API. This document describes the canonical activation flow using the Anthropic API. This document describes the current activation flow using
`disinto hire-an-agent` and `[agents.X]` TOML configuration. `disinto hire-an-agent` and `[agents.X]` TOML configuration.
> **Note:** The legacy `ENABLE_LLAMA_AGENT=1` env flag has been removed (#846).
> Activation is now done exclusively via `[agents.X]` sections in project TOML.
## Overview ## Overview
Local-model agents are configured via `[agents.<name>]` sections in Local-model agents are configured via `[agents.<name>]` sections in

View file

@ -1,124 +0,0 @@
<!-- last-reviewed: (new file, S2.5 #883) -->
# Nomad+Vault migration — cutover-day runbook
`disinto init --backend=nomad` is the single entry-point that turns a fresh
LXC (with the disinto repo cloned) into a running Nomad+Vault cluster with
policies applied, JWT workload-identity auth configured, secrets imported
from the old docker stack, and services deployed.
## Cutover-day invocation
On the new LXC, as root (or an operator with NOPASSWD sudo):
```bash
# Copy the plaintext .env + sops-encrypted .env.vault.enc + age keyfile
# from the old box first (out of band — SSH, USB, whatever your ops
# procedure allows). Then:
sudo ./bin/disinto init \
--backend=nomad \
--import-env /tmp/.env \
--import-sops /tmp/.env.vault.enc \
--age-key /tmp/keys.txt \
--with forgejo
```
This runs, in order:
1. **`lib/init/nomad/cluster-up.sh`** (S0) — installs Nomad + Vault
binaries, writes `/etc/nomad.d/*`, initializes Vault, starts both
services, waits for the Nomad node to become ready.
2. **`tools/vault-apply-policies.sh`** (S2.1) — syncs every
`vault/policies/*.hcl` into Vault as an ACL policy. Idempotent.
3. **`lib/init/nomad/vault-nomad-auth.sh`** (S2.3) — enables Vault's
JWT auth method at `jwt-nomad`, points it at Nomad's JWKS, writes
one role per policy, reloads Nomad so jobs can exchange
workload-identity tokens for Vault tokens. Idempotent.
4. **`tools/vault-import.sh`** (S2.2) — reads `/tmp/.env` and the
sops-decrypted `/tmp/.env.vault.enc`, writes them to the KV paths
matching the S2.1 policy layout (`kv/disinto/bots/*`, `kv/disinto/shared/*`,
`kv/disinto/runner/*`). Idempotent (overwrites KV v2 data in place).
5. **`lib/init/nomad/deploy.sh forgejo`** (S1) — validates + runs the
`nomad/jobs/forgejo.hcl` jobspec. Forgejo reads its admin creds from
Vault via the `template` stanza (S2.4).
## Flag summary
| Flag | Meaning |
|---|---|
| `--backend=nomad` | Switch the init dispatcher to the Nomad+Vault path (instead of docker compose). |
| `--empty` | Bring the cluster up, skip policies/auth/import/deploy. Escape hatch for debugging. |
| `--with forgejo[,…]` | Deploy these services after the cluster is up. |
| `--import-env PATH` | Plaintext `.env` from the old stack. Optional. |
| `--import-sops PATH` | Sops-encrypted `.env.vault.enc` from the old stack. Requires `--age-key`. |
| `--age-key PATH` | Age keyfile used to decrypt `--import-sops`. Requires `--import-sops`. |
| `--dry-run` | Print the full plan (cluster-up + policies + auth + import + deploy) and exit. Touches nothing. |
### Flag validation
- `--import-sops` without `--age-key` → error.
- `--age-key` without `--import-sops` → error.
- `--import-env` alone (no sops) → OK (imports just the plaintext `.env`).
- `--backend=docker` with any `--import-*` flag → error.
- `--empty` with any `--import-*` flag → error (mutually exclusive: `--empty`
skips the import step, so pairing them silently discards the import
intent).
## Idempotency
Every layer is idempotent by design. Re-running the same command on an
already-provisioned box is a no-op at every step:
- **Cluster-up:** second run detects running `nomad`/`vault` systemd
units and state files, skips re-init.
- **Policies:** byte-for-byte compare against on-server policy text;
"unchanged" for every untouched file.
- **Auth:** skips auth-method create if `jwt-nomad/` already enabled,
skips config write if the JWKS + algs match, skips server.hcl write if
the file on disk is identical to the repo copy.
- **Import:** KV v2 writes overwrite in place (same path, same keys,
same values → no new version).
- **Deploy:** `nomad job run` is declarative; same jobspec → no new
allocation.
## Dry-run
```bash
./bin/disinto init --backend=nomad \
--import-env /tmp/.env \
--import-sops /tmp/.env.vault.enc \
--age-key /tmp/keys.txt \
--with forgejo \
--dry-run
```
Prints the five-section plan — cluster-up, policies, auth, import,
deploy — with every path and every argv that would be executed. No
network, no sudo, no state mutation. See
`tests/disinto-init-nomad.bats` for the exact output shape.
## No-import path
If you already have `kv/disinto/*` seeded by other means (manual
`vault kv put`, a replica, etc.), omit all three `--import-*` flags.
`disinto init --backend=nomad --with forgejo` still applies policies,
configures auth, and deploys — but skips the import step with:
```
[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services
```
Forgejo's template stanza will fail to render (and thus the allocation
will stall) until those KV paths exist — so either import them or seed
them first.
## Secret hygiene
- Never log a secret value. The CLI only prints paths (`--import-env`,
`--age-key`) and KV *paths* (`kv/disinto/bots/review/token`), never
the values themselves. `tools/vault-import.sh` is the only thing that
reads the values, and it pipes them directly into Vault's HTTP API.
- The age keyfile must be mode 0400 — `vault-import.sh` refuses to
source a keyfile with looser permissions.
- `VAULT_ADDR` must be localhost during import — the import tool
refuses to run against a remote Vault, preventing accidental exposure.

View file

@ -178,8 +178,8 @@ log "Tagged disinto/agents:${RELEASE_VERSION}"
log "Step 6/6: Restarting agent containers" log "Step 6/6: Restarting agent containers"
docker compose stop agents 2>/dev/null || true docker compose stop agents agents-llama 2>/dev/null || true
docker compose up -d agents docker compose up -d agents agents-llama
log "Agent containers restarted" log "Agent containers restarted"
# ── Done ───────────────────────────────────────────────────────────────── # ── Done ─────────────────────────────────────────────────────────────────

View file

@ -189,10 +189,10 @@ Restart agent containers to use the new image.
- docker compose pull agents - docker compose pull agents
2. Stop and remove existing agent containers: 2. Stop and remove existing agent containers:
- docker compose down agents - docker compose down agents agents-llama 2>/dev/null || true
3. Start agents with new image: 3. Start agents with new image:
- docker compose up -d agents - docker compose up -d agents agents-llama
4. Wait for containers to be healthy: 4. Wait for containers to be healthy:
- for i in {1..30}; do - for i in {1..30}; do
@ -203,7 +203,7 @@ Restart agent containers to use the new image.
- done - done
5. Verify containers are running: 5. Verify containers are running:
- docker compose ps agents - docker compose ps agents agents-llama
6. Log restart: 6. Log restart:
- echo "Restarted agents containers" - echo "Restarted agents containers"

View file

@ -29,7 +29,7 @@ and injected into your prompt above. Review them now.
1. Read the injected metrics data carefully (System Resources, Docker, 1. Read the injected metrics data carefully (System Resources, Docker,
Active Sessions, Phase Files, Stale Phase Cleanup, Lock Files, Agent Logs, Active Sessions, Phase Files, Stale Phase Cleanup, Lock Files, Agent Logs,
CI Pipelines, Open PRs, Issue Status, Stale Worktrees, **Woodpecker Agent Health**). CI Pipelines, Open PRs, Issue Status, Stale Worktrees).
Note: preflight.sh auto-removes PHASE:escalate files for closed issues Note: preflight.sh auto-removes PHASE:escalate files for closed issues
(24h grace period). Check the "Stale Phase Cleanup" section for any (24h grace period). Check the "Stale Phase Cleanup" section for any
files cleaned or in grace period this run. files cleaned or in grace period this run.
@ -75,10 +75,6 @@ Categorize every finding from the metrics into priority levels.
- Dev/action sessions in PHASE:escalate for > 24h (session timeout) - Dev/action sessions in PHASE:escalate for > 24h (session timeout)
(Note: PHASE:escalate files for closed issues are auto-cleaned by preflight; (Note: PHASE:escalate files for closed issues are auto-cleaned by preflight;
this check covers sessions where the issue is still open) this check covers sessions where the issue is still open)
- **Woodpecker agent unhealthy** see "Woodpecker Agent Health" section in preflight:
- Container not running or in unhealthy state
- gRPC errors >= 3 in last 20 minutes
- Fast-failure pipelines (duration < 60s) >= 3 in last 15 minutes
### P3 — Factory degraded ### P3 — Factory degraded
- PRs stale: CI finished >20min ago AND no git push to the PR branch since CI completed - PRs stale: CI finished >20min ago AND no git push to the PR branch since CI completed
@ -104,15 +100,6 @@ For each finding from the health assessment, decide and execute an action.
### Auto-fixable (execute these directly) ### Auto-fixable (execute these directly)
**P2 Woodpecker agent unhealthy:**
The supervisor-run.sh script automatically handles WP agent recovery:
- Detects unhealthy state via preflight.sh health checks
- Restarts container via `docker restart`
- Scans for `blocked: ci_exhausted` issues updated in last 30 minutes
- Unassigns and removes blocked label from affected issues
- Posts recovery comment with infra-flake context
- Avoids duplicate restarts via 5-minute cooldown in history file
**P0 Memory crisis:** **P0 Memory crisis:**
# Kill stale one-shot claude processes (>3h old) # Kill stale one-shot claude processes (>3h old)
pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true
@ -261,11 +248,6 @@ Format:
- <what was fixed> - <what was fixed>
(or "No actions needed") (or "No actions needed")
### WP Agent Recovery (if applicable)
- WP agent restart: <time of restart or "none">
- Issues recovered: <count>
- Reason: <health check reason or "healthy">
### Vault items filed ### Vault items filed
- vault/pending/<id>.md <reason> - vault/pending/<id>.md <reason>
(or "None") (or "None")

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a7a046b81a7f454ebec43bab643067bd952d50b0 --> <!-- last-reviewed: 2a7ae0b7eae5979b2c53e3bd1c4280dfdc9df785 -->
# Gardener Agent # Gardener Agent
**Role**: Backlog grooming — detect duplicate issues, missing acceptance **Role**: Backlog grooming — detect duplicate issues, missing acceptance

View file

@ -1 +0,0 @@
{"issue":915,"group":"lib/generators.sh","title":"remove no-op sed in generate_compose --build mode","reason":"sed replaces agents: with itself — no behavior change; single-line removal","ts":"2026-04-17T01:04:05Z"}

View file

@ -1 +1,7 @@
[] [
{
"action": "edit_body",
"issue": 835,
"body": "Bugfix for S0.1 (#821). Discovered during Step 0 end-to-end verification on a fresh LXC.\n\n## Symptom\n\n```\n$ ./bin/disinto init --backend=nomad --empty\nError: --empty is only valid with --backend=nomad\n```\n\nThe error is nonsensical — `--backend=nomad` is right there.\n\n## Root cause\n\n`bin/disinto` → `disinto_init` (around line 710) consumes the first positional arg as `repo_url` **before** the argparse `while` loop runs:\n\n```bash\ndisinto_init() {\n local repo_url=\"${1:-}\"\n if [ -z \"$repo_url\" ]; then\n echo \"Error: repo URL required\" >&2\n ...\n fi\n shift\n # ... then while-loop parses flags ...\n}\n```\n\nSo `disinto init --backend=nomad --empty` becomes:\n- `repo_url = \"--backend=nomad\"` (swallowed)\n- `--empty` seen by loop → `empty=true`\n- `backend` stays at default `\"docker\"`\n- Validation at line 747: `empty=true && backend != \"nomad\"` → error\n\n## Why repo_url is wrong for nomad\n\nFor `--backend=nomad`, the cluster-up flow doesn't clone anything — the LXC already has the repo cloned by the operator. `repo_url` is a docker-backend concept.\n\n## Fix\n\nIn `disinto_init`, move backend detection to **before** the `repo_url` consumption, and make `repo_url` conditional on `backend=docker`:\n\n```bash\ndisinto_init() {\n # Pre-scan for --backend to know whether repo_url is required\n local backend=\"docker\"\n for arg in \"$@\"; do\n case \"$arg\" in\n --backend) ;; # handled below\n --backend=*) backend=\"${arg#--backend=}\" ;;\n esac\n done\n # Also handle space-separated form\n local i=1\n while [ $i -le $# ]; do\n if [ \"${!i}\" = \"--backend\" ]; then\n i=$((i+1))\n backend=\"${!i}\"\n fi\n i=$((i+1))\n done\n\n local repo_url=\"\"\n if [ \"$backend\" = \"docker\" ]; then\n repo_url=\"${1:-}\"\n if [ -z \"$repo_url\" ] || [[ \"$repo_url\" == --* ]]; then\n echo \"Error: repo URL required for docker backend\" >&2\n echo \"Usage: disinto init <repo-url> [options]\" >&2\n exit 1\n fi\n shift\n fi\n # ... 
rest of argparse unchanged, it re-reads --backend cleanly\n```\n\nSimpler alternative: if first arg starts with `--`, assume no positional and skip repo_url consumption entirely (covers nomad + any future `--help`-style invocation).\n\nEither shape is fine; pick the cleaner one.\n\n## Acceptance criteria\n\n- [ ] `./bin/disinto init --backend=nomad --empty` runs `lib/init/nomad/cluster-up.sh` without error on a clean LXC.\n- [ ] `./bin/disinto init --backend=nomad --empty --dry-run` prints the 9-step plan and exits 0.\n- [ ] `./bin/disinto init <repo-url>` (docker path) behaves identically to today — existing smoke path passes.\n- [ ] `./bin/disinto init` (no args, docker implied) still errors with the \"repo URL required\" message.\n- [ ] `./bin/disinto init --backend=docker` (no repo) errors helpfully — not \"Unknown option: --backend=docker\".\n- [ ] shellcheck clean.\n\n## Verified regression case from Step 0 testing\n\nOn a fresh Ubuntu 24.04 LXC, after `./lib/init/nomad/cluster-up.sh` was invoked directly (workaround), the cluster came up healthy end-to-end:\n\n- Nomad node status: 1 node ready\n- Vault status: Sealed=false, Initialized=true\n- Re-run of cluster-up.sh was fully idempotent\n\nSo the bug is isolated to `bin/disinto` argparse; the rest of the Step 0 code path is solid. This fix unblocks the formal Step 0 acceptance test.\n\n## Labels / meta\n\n- `[nomad-step-0] S0.1-fix` — no dependencies; gates Step 1.\n\n## Affected files\n\n- `bin/disinto` — `disinto_init()` function, around line 710: pre-scan for `--backend` before consuming `repo_url` positional argument\n"
}
]

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a7a046b81a7f454ebec43bab643067bd952d50b0 --> <!-- last-reviewed: 2a7ae0b7eae5979b2c53e3bd1c4280dfdc9df785 -->
# Shared Helpers (`lib/`) # Shared Helpers (`lib/`)
All agents source `lib/env.sh` as their first action. Additional helpers are All agents source `lib/env.sh` as their first action. Additional helpers are
@ -34,5 +34,5 @@ sourced as needed.
| `lib/sprint-filer.sh` | Post-merge sub-issue filer for sprint PRs. Invoked by the `.woodpecker/ops-filer.yml` pipeline after a sprint PR merges to ops repo `main`. Parses `<!-- filer:begin --> ... <!-- filer:end -->` blocks from sprint PR bodies to extract sub-issue definitions, creates them on the project repo using `FORGE_FILER_TOKEN` (narrow-scope `filer-bot` identity with `issues:write` only), adds `in-progress` label to the parent vision issue, and handles vision lifecycle closure when all sub-issues are closed. Uses `filer_api_all()` for paginated fetches. Idempotent: uses `<!-- decomposed-from: #<vision>, sprint: <slug>, id: <id> -->` markers to skip already-filed issues. Requires `FORGE_FILER_TOKEN`, `FORGE_API`, `FORGE_API_BASE`, `FORGE_OPS_REPO`. | `.woodpecker/ops-filer.yml` (CI pipeline on ops repo) | | `lib/sprint-filer.sh` | Post-merge sub-issue filer for sprint PRs. Invoked by the `.woodpecker/ops-filer.yml` pipeline after a sprint PR merges to ops repo `main`. Parses `<!-- filer:begin --> ... <!-- filer:end -->` blocks from sprint PR bodies to extract sub-issue definitions, creates them on the project repo using `FORGE_FILER_TOKEN` (narrow-scope `filer-bot` identity with `issues:write` only), adds `in-progress` label to the parent vision issue, and handles vision lifecycle closure when all sub-issues are closed. Uses `filer_api_all()` for paginated fetches. Idempotent: uses `<!-- decomposed-from: #<vision>, sprint: <slug>, id: <id> -->` markers to skip already-filed issues. Requires `FORGE_FILER_TOKEN`, `FORGE_API`, `FORGE_API_BASE`, `FORGE_OPS_REPO`. | `.woodpecker/ops-filer.yml` (CI pipeline on ops repo) |
| `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) | | `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) |
| `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) | | `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) |
| `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_get_or_empty(PATH)` — GET /v1/PATH; 200→raw body, 404→empty, else structured error + return 1 (used by sync scripts to distinguish "absent, create" from hard failure without tripping errexit, #881). `hvault_ensure_kv_v2(MOUNT, [LOG_PREFIX])` — idempotent KV v2 mount assertion: enables mount if absent, fails loudly if present as wrong type/version. Extracted from all `vault-seed-*.sh` scripts to eliminate dup-detector violations. Respects `DRY_RUN=1`. `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | `tools/vault-apply-policies.sh`, `tools/vault-apply-roles.sh`, `lib/init/nomad/vault-nomad-auth.sh`, `tools/vault-seed-*.sh` | | `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | Not sourced at runtime yet — pure scaffolding for Nomad+Vault migration (#799) |
| `lib/init/nomad/` | Nomad+Vault installer scripts. `cluster-up.sh` — idempotent Step-0 orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling. `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. `vault-nomad-auth.sh` — Step-2 script that enables Vault's JWT auth at path `jwt-nomad`, writes the JWKS/algs config pointing at Nomad's workload-identity signer, delegates role sync to `tools/vault-apply-roles.sh`, installs `/etc/nomad.d/server.hcl`, and SIGHUPs `nomad.service` if the file changed (#881). `wp-oauth-register.sh` — S3.3 script that creates the Woodpecker OAuth2 app in Forgejo and stores `forgejo_client`/`forgejo_secret` in Vault KV v2 at `kv/disinto/shared/woodpecker`; idempotent (skips if app or secrets already present); called by `bin/disinto --with woodpecker`. Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` | | `lib/init/nomad/` | Nomad+Vault Step 0 installer scripts. `cluster-up.sh` — idempotent orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling. `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. 
| `bin/disinto init --backend=nomad` |

View file

@ -128,6 +128,7 @@ vault_request() {
# Validate TOML content # Validate TOML content
local tmp_toml local tmp_toml
tmp_toml=$(mktemp /tmp/vault-XXXXXX.toml) tmp_toml=$(mktemp /tmp/vault-XXXXXX.toml)
trap 'rm -f "$tmp_toml"' RETURN
printf '%s' "$toml_content" > "$tmp_toml" printf '%s' "$toml_content" > "$tmp_toml"
@ -135,7 +136,6 @@ vault_request() {
local vault_env="${FACTORY_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/action-vault/vault-env.sh" local vault_env="${FACTORY_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/action-vault/vault-env.sh"
if [ ! -f "$vault_env" ]; then if [ ! -f "$vault_env" ]; then
echo "ERROR: vault-env.sh not found at $vault_env" >&2 echo "ERROR: vault-env.sh not found at $vault_env" >&2
rm -f "$tmp_toml"
return 1 return 1
fi fi
@ -145,15 +145,11 @@ vault_request() {
if ! source "$vault_env"; then if ! source "$vault_env"; then
FORGE_TOKEN="${_saved_forge_token:-}" FORGE_TOKEN="${_saved_forge_token:-}"
echo "ERROR: failed to source vault-env.sh" >&2 echo "ERROR: failed to source vault-env.sh" >&2
rm -f "$tmp_toml"
return 1 return 1
fi fi
# Restore caller's FORGE_TOKEN after validation # Restore caller's FORGE_TOKEN after validation
FORGE_TOKEN="${_saved_forge_token:-}" FORGE_TOKEN="${_saved_forge_token:-}"
# Set trap AFTER sourcing vault-env.sh to avoid RETURN trap firing during source
trap 'rm -f "$tmp_toml"' RETURN
# Run validation # Run validation
if ! validate_vault_action "$tmp_toml"; then if ! validate_vault_action "$tmp_toml"; then
echo "ERROR: TOML validation failed" >&2 echo "ERROR: TOML validation failed" >&2

View file

@ -356,6 +356,16 @@ setup_forge() {
[predictor-bot]="FORGE_PREDICTOR_PASS" [predictor-bot]="FORGE_PREDICTOR_PASS"
[architect-bot]="FORGE_ARCHITECT_PASS" [architect-bot]="FORGE_ARCHITECT_PASS"
) )
# Llama bot users (local-model agents) — separate from main agents
# Each llama agent gets its own Forgejo user, token, and password
local -A llama_token_vars=(
[dev-qwen]="FORGE_TOKEN_LLAMA"
[dev-qwen-nightly]="FORGE_TOKEN_LLAMA_NIGHTLY"
)
local -A llama_pass_vars=(
[dev-qwen]="FORGE_PASS_LLAMA"
[dev-qwen-nightly]="FORGE_PASS_LLAMA_NIGHTLY"
)
local bot_user bot_pass token token_var pass_var local bot_user bot_pass token token_var pass_var
@ -505,12 +515,159 @@ setup_forge() {
fi fi
done done
# Create llama bot users and tokens (local-model agents)
# These are separate from the main agents and get their own credentials
echo ""
echo "── Setting up llama bot users ────────────────────────────"
local llama_user llama_pass llama_token llama_token_var llama_pass_var
for llama_user in "${!llama_token_vars[@]}"; do
llama_token_var="${llama_token_vars[$llama_user]}"
llama_pass_var="${llama_pass_vars[$llama_user]}"
# Check if token already exists in .env
local token_exists=false
if _token_exists_in_env "$llama_token_var" "$env_file"; then
token_exists=true
fi
# Check if password already exists in .env
local pass_exists=false
if _pass_exists_in_env "$llama_pass_var" "$env_file"; then
pass_exists=true
fi
# Check if llama bot user exists on Forgejo
local llama_user_exists=false
if curl -sf --max-time 5 \
-H "Authorization: token ${admin_token}" \
"${forge_url}/api/v1/users/${llama_user}" >/dev/null 2>&1; then
llama_user_exists=true
fi
# Skip token/password regeneration if both exist in .env and not forcing rotation
if [ "$token_exists" = true ] && [ "$pass_exists" = true ] && [ "$rotate_tokens" = false ]; then
echo " ${llama_user} token and password preserved (use --rotate-tokens to force)"
# Still export the existing token for use within this run
local existing_token existing_pass
existing_token=$(grep "^${llama_token_var}=" "$env_file" | head -1 | cut -d= -f2-)
existing_pass=$(grep "^${llama_pass_var}=" "$env_file" | head -1 | cut -d= -f2-)
export "${llama_token_var}=${existing_token}"
export "${llama_pass_var}=${existing_pass}"
continue
fi
# Generate new credentials if:
# - Token doesn't exist (first run)
# - Password doesn't exist (first run)
# - --rotate-tokens flag is set (explicit rotation)
if [ "$llama_user_exists" = false ]; then
# User doesn't exist - create it
llama_pass="llama-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
echo "Creating llama bot user: ${llama_user}"
local create_output
if ! create_output=$(_forgejo_exec forgejo admin user create \
--username "${llama_user}" \
--password "${llama_pass}" \
--email "${llama_user}@disinto.local" \
--must-change-password=false 2>&1); then
echo "Error: failed to create llama bot user '${llama_user}':" >&2
echo " ${create_output}" >&2
exit 1
fi
# Forgejo 11.x ignores --must-change-password=false on create;
# explicitly clear the flag so basic-auth token creation works.
_forgejo_exec forgejo admin user change-password \
--username "${llama_user}" \
--password "${llama_pass}" \
--must-change-password=false
# Verify llama bot user was actually created
if ! curl -sf --max-time 5 \
-H "Authorization: token ${admin_token}" \
"${forge_url}/api/v1/users/${llama_user}" >/dev/null 2>&1; then
echo "Error: llama bot user '${llama_user}' not found after creation" >&2
exit 1
fi
echo " ${llama_user} user created"
else
# User exists - reset password if needed
echo " ${llama_user} user exists"
if [ "$rotate_tokens" = true ] || [ "$pass_exists" = false ]; then
llama_pass="llama-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
_forgejo_exec forgejo admin user change-password \
--username "${llama_user}" \
--password "${llama_pass}" \
--must-change-password=false || {
echo "Error: failed to reset password for existing llama bot user '${llama_user}'" >&2
exit 1
}
echo " ${llama_user} password reset for token generation"
else
# Password exists, get it from .env
llama_pass=$(grep "^${llama_pass_var}=" "$env_file" | head -1 | cut -d= -f2-)
fi
fi
# Generate token via API (basic auth as the llama user)
# First, delete any existing tokens to avoid name collision
local existing_llama_token_ids
existing_llama_token_ids=$(curl -sf \
-u "${llama_user}:${llama_pass}" \
"${forge_url}/api/v1/users/${llama_user}/tokens" 2>/dev/null \
| jq -r '.[].id // empty' 2>/dev/null) || existing_llama_token_ids=""
# Delete any existing tokens for this user
if [ -n "$existing_llama_token_ids" ]; then
while IFS= read -r tid; do
[ -n "$tid" ] && curl -sf -X DELETE \
-u "${llama_user}:${llama_pass}" \
"${forge_url}/api/v1/users/${llama_user}/tokens/${tid}" >/dev/null 2>&1 || true
done <<< "$existing_llama_token_ids"
fi
llama_token=$(curl -sf -X POST \
-u "${llama_user}:${llama_pass}" \
-H "Content-Type: application/json" \
"${forge_url}/api/v1/users/${llama_user}/tokens" \
-d "{\"name\":\"disinto-${llama_user}-token\",\"scopes\":[\"all\"]}" 2>/dev/null \
| jq -r '.sha1 // empty') || llama_token=""
if [ -z "$llama_token" ]; then
echo "Error: failed to create API token for '${llama_user}'" >&2
exit 1
fi
# Store token in .env under the llama-specific variable name
if grep -q "^${llama_token_var}=" "$env_file" 2>/dev/null; then
sed -i "s|^${llama_token_var}=.*|${llama_token_var}=${llama_token}|" "$env_file"
else
printf '%s=%s\n' "$llama_token_var" "$llama_token" >> "$env_file"
fi
export "${llama_token_var}=${llama_token}"
echo " ${llama_user} token generated and saved (${llama_token_var})"
# Store password in .env for git HTTP push (#361)
# Forgejo 11.x API tokens don't work for git push; password auth does.
if grep -q "^${llama_pass_var}=" "$env_file" 2>/dev/null; then
sed -i "s|^${llama_pass_var}=.*|${llama_pass_var}=${llama_pass}|" "$env_file"
else
printf '%s=%s\n' "$llama_pass_var" "$llama_pass" >> "$env_file"
fi
export "${llama_pass_var}=${llama_pass}"
echo " ${llama_user} password saved (${llama_pass_var})"
done
# Create .profile repos for all bot users (if they don't already exist) # Create .profile repos for all bot users (if they don't already exist)
# This runs the same logic as hire-an-agent Step 2-3 for idempotent setup # This runs the same logic as hire-an-agent Step 2-3 for idempotent setup
echo "" echo ""
echo "── Setting up .profile repos ────────────────────────────" echo "── Setting up .profile repos ────────────────────────────"
local -a bot_users=(dev-bot review-bot planner-bot gardener-bot vault-bot supervisor-bot predictor-bot architect-bot) local -a bot_users=(dev-bot review-bot planner-bot gardener-bot vault-bot supervisor-bot predictor-bot architect-bot)
# Add llama bot users to .profile repo creation
for llama_user in "${!llama_token_vars[@]}"; do
bot_users+=("$llama_user")
done
local bot_user local bot_user
for bot_user in "${bot_users[@]}"; do for bot_user in "${bot_users[@]}"; do
@ -618,6 +775,15 @@ setup_forge() {
-d "{\"permission\":\"${bot_perm}\"}" >/dev/null 2>&1 || true -d "{\"permission\":\"${bot_perm}\"}" >/dev/null 2>&1 || true
done done
# Add llama bot users as write collaborators for local-model agents
for llama_user in "${!llama_token_vars[@]}"; do
curl -sf -X PUT \
-H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \
-H "Content-Type: application/json" \
"${forge_url}/api/v1/repos/${repo_slug}/collaborators/${llama_user}" \
-d '{"permission":"write"}' >/dev/null 2>&1 || true
done
# Add disinto-admin as admin collaborator # Add disinto-admin as admin collaborator
curl -sf -X PUT \ curl -sf -X PUT \
-H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \ -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \

View file

@ -66,6 +66,28 @@ _get_primary_woodpecker_repo_id() {
echo "$max_id" echo "$max_id"
} }
# Duplicate-service bookkeeping: names seen so far, and where each came from.
declare -A _seen_services
declare -A _service_sources

# _record_service NAME SOURCE
# Register NAME as emitted by SOURCE. Returns 0 on first sighting; on a
# repeat, prints a diagnostic naming both origins to stderr and returns 1.
_record_service() {
    local svc="$1" origin="$2"

    # First time we see this name: remember it and where it came from.
    if [ -z "${_seen_services[$svc]:-}" ]; then
        _seen_services[$svc]=1
        _service_sources[$svc]="$origin"
        return 0
    fi

    # Collision: report both the original and the conflicting source.
    local first_origin="${_service_sources[$svc]}"
    echo "ERROR: Duplicate service name '$svc' detected —" >&2
    echo " '$svc' emitted twice — from $first_origin and from $origin" >&2
    echo " Remove one of the conflicting activations to proceed." >&2
    return 1
}
# Parse project TOML for local-model agents and emit compose services. # Parse project TOML for local-model agents and emit compose services.
# Writes service definitions to stdout; caller handles insertion into compose file. # Writes service definitions to stdout; caller handles insertion into compose file.
_generate_local_model_services() { _generate_local_model_services() {
@ -97,6 +119,16 @@ _generate_local_model_services() {
POLL_INTERVAL) poll_interval_val="$value" ;; POLL_INTERVAL) poll_interval_val="$value" ;;
---) ---)
if [ -n "$service_name" ] && [ -n "$base_url" ]; then if [ -n "$service_name" ] && [ -n "$base_url" ]; then
# Record service for duplicate detection using the full service name
local full_service_name="agents-${service_name}"
local toml_basename
toml_basename=$(basename "$toml")
if ! _record_service "$full_service_name" "[agents.$service_name] in projects/$toml_basename"; then
# Duplicate detected — clean up and abort
rm -f "$temp_file"
return 1
fi
# Per-agent FORGE_TOKEN / FORGE_PASS lookup (#834 Gap 3). # Per-agent FORGE_TOKEN / FORGE_PASS lookup (#834 Gap 3).
# Two hired llama agents must not share the same Forgejo identity, # Two hired llama agents must not share the same Forgejo identity,
# so we key the env-var lookup by forge_user (which hire-agent.sh # so we key the env-var lookup by forge_user (which hire-agent.sh
@ -109,25 +141,15 @@ _generate_local_model_services() {
# the service caused `disinto up` (without COMPOSE_PROFILES) # the service caused `disinto up` (without COMPOSE_PROFILES)
# to treat the hired container as an orphan and silently # to treat the hired container as an orphan and silently
# remove it via --remove-orphans. # remove it via --remove-orphans.
# Compute the actual service name that will be emitted
local full_service_name="agents-${service_name}"
local user_upper local user_upper
user_upper=$(echo "$forge_user" | tr 'a-z-' 'A-Z_') user_upper=$(echo "$forge_user" | tr 'a-z-' 'A-Z_')
cat >> "$temp_file" <<EOF cat >> "$temp_file" <<EOF
agents-${service_name}: ${full_service_name}:
# Local image ref (#853): registry-less name matches what \`disinto init --build\` image: ghcr.io/disinto/agents:\${DISINTO_IMAGE_TAG:-latest}
# and the legacy agents-llama stanza produce. Paired with build: so hosts without
# a pre-built image can rebuild locally; ghcr.io/disinto/agents is not publicly
# pullable, and emitting that prefix caused \`docker compose up\` to fail with
# \`denied\` on every hired agent.
build:
context: .
dockerfile: docker/agents/Dockerfile
image: disinto/agents:\${DISINTO_IMAGE_TAG:-latest}
# Rebuild on every up (#887): without this, \`docker compose up -d --force-recreate\`
# reuses the cached image and silently keeps running stale docker/agents/ code
# even after the repo is updated. \`pull_policy: build\` makes Compose rebuild
# the image on every up; BuildKit layer cache makes unchanged rebuilds fast.
pull_policy: build
container_name: disinto-agents-${service_name} container_name: disinto-agents-${service_name}
restart: unless-stopped restart: unless-stopped
security_opt: security_opt:
@ -139,13 +161,9 @@ _generate_local_model_services() {
- \${CLAUDE_CONFIG_FILE:-\${HOME}/.claude.json}:/home/agent/.claude.json:ro - \${CLAUDE_CONFIG_FILE:-\${HOME}/.claude.json}:/home/agent/.claude.json:ro
- \${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - \${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
- \${AGENT_SSH_DIR:-\${HOME}/.ssh}:/home/agent/.ssh:ro - \${AGENT_SSH_DIR:-\${HOME}/.ssh}:/home/agent/.ssh:ro
- ./projects:/home/agent/disinto/projects:ro
- ./.env:/home/agent/disinto/.env:ro
- ./state:/home/agent/disinto/state
environment: environment:
FORGE_URL: http://forgejo:3000 FORGE_URL: http://forgejo:3000
FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto} FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto}
FACTORY_REPO: ${FORGE_REPO:-disinto-admin/disinto}
# Per-agent credentials keyed by forge_user (#834 Gap 3). # Per-agent credentials keyed by forge_user (#834 Gap 3).
FORGE_TOKEN: \${FORGE_TOKEN_${user_upper}:-} FORGE_TOKEN: \${FORGE_TOKEN_${user_upper}:-}
FORGE_PASS: \${FORGE_PASS_${user_upper}:-} FORGE_PASS: \${FORGE_PASS_${user_upper}:-}
@ -282,6 +300,17 @@ _generate_compose_impl() {
return 0 return 0
fi fi
# Initialize duplicate detection with base services defined in the template
_record_service "agents" "base compose template" || return 1
_record_service "forgejo" "base compose template" || return 1
_record_service "woodpecker" "base compose template" || return 1
_record_service "woodpecker-agent" "base compose template" || return 1
_record_service "runner" "base compose template" || return 1
_record_service "edge" "base compose template" || return 1
_record_service "staging" "base compose template" || return 1
_record_service "staging-deploy" "base compose template" || return 1
_record_service "chat" "base compose template" || return 1
# Extract primary woodpecker_repo_id from project TOML files # Extract primary woodpecker_repo_id from project TOML files
local wp_repo_id local wp_repo_id
wp_repo_id=$(_get_primary_woodpecker_repo_id) wp_repo_id=$(_get_primary_woodpecker_repo_id)
@ -438,6 +467,138 @@ services:
COMPOSEEOF COMPOSEEOF
# ── Conditional agents-llama block (ENABLE_LLAMA_AGENT=1) ──────────────
# Local-Qwen dev agent — gated on ENABLE_LLAMA_AGENT so factories without
# a local llama endpoint don't try to start it. See docs/agents-llama.md.
if [ "${ENABLE_LLAMA_AGENT:-0}" = "1" ]; then
# Check for duplicate with TOML-configured agents
if ! _record_service "agents-llama" "ENABLE_LLAMA_AGENT=1"; then
return 1
fi
if ! _record_service "agents-llama-all" "ENABLE_LLAMA_AGENT=1"; then
return 1
fi
cat >> "$compose_file" <<'LLAMAEOF'
agents-llama:
build:
context: .
dockerfile: docker/agents/Dockerfile
container_name: disinto-agents-llama
restart: unless-stopped
security_opt:
- apparmor=unconfined
volumes:
- agent-data:/home/agent/data
- project-repos:/home/agent/repos
- ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
- ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
- ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
- ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
- ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
- woodpecker-data:/woodpecker-data:ro
environment:
FORGE_URL: http://forgejo:3000
FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto}
FORGE_TOKEN: ${FORGE_TOKEN_LLAMA:-}
FORGE_PASS: ${FORGE_PASS_LLAMA:-}
FORGE_BOT_USERNAMES: ${FORGE_BOT_USERNAMES:-}
WOODPECKER_TOKEN: ${WOODPECKER_TOKEN:-}
CLAUDE_TIMEOUT: ${CLAUDE_TIMEOUT:-7200}
CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: ${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
CLAUDE_AUTOCOMPACT_PCT_OVERRIDE: "60"
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-}
FORGE_ADMIN_PASS: ${FORGE_ADMIN_PASS:-}
DISINTO_CONTAINER: "1"
PROJECT_NAME: ${PROJECT_NAME:-project}
PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
WOODPECKER_DATA_DIR: /woodpecker-data
WOODPECKER_REPO_ID: "PLACEHOLDER_WP_REPO_ID"
CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
POLL_INTERVAL: ${POLL_INTERVAL:-300}
AGENT_ROLES: dev
healthcheck:
test: ["CMD", "pgrep", "-f", "entrypoint.sh"]
interval: 60s
timeout: 5s
retries: 3
start_period: 30s
depends_on:
forgejo:
condition: service_healthy
networks:
- disinto-net
agents-llama-all:
build:
context: .
dockerfile: docker/agents/Dockerfile
container_name: disinto-agents-llama-all
restart: unless-stopped
profiles: ["agents-llama-all"]
security_opt:
- apparmor=unconfined
volumes:
- agent-data:/home/agent/data
- project-repos:/home/agent/repos
- ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
- ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
- ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
- ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
- ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
- woodpecker-data:/woodpecker-data:ro
environment:
FORGE_URL: http://forgejo:3000
FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto}
FORGE_TOKEN: ${FORGE_TOKEN_LLAMA:-}
FORGE_PASS: ${FORGE_PASS_LLAMA:-}
FORGE_REVIEW_TOKEN: ${FORGE_REVIEW_TOKEN:-}
FORGE_PLANNER_TOKEN: ${FORGE_PLANNER_TOKEN:-}
FORGE_GARDENER_TOKEN: ${FORGE_GARDENER_TOKEN:-}
FORGE_VAULT_TOKEN: ${FORGE_VAULT_TOKEN:-}
FORGE_SUPERVISOR_TOKEN: ${FORGE_SUPERVISOR_TOKEN:-}
FORGE_PREDICTOR_TOKEN: ${FORGE_PREDICTOR_TOKEN:-}
FORGE_ARCHITECT_TOKEN: ${FORGE_ARCHITECT_TOKEN:-}
FORGE_FILER_TOKEN: ${FORGE_FILER_TOKEN:-}
FORGE_BOT_USERNAMES: ${FORGE_BOT_USERNAMES:-}
WOODPECKER_TOKEN: ${WOODPECKER_TOKEN:-}
CLAUDE_TIMEOUT: ${CLAUDE_TIMEOUT:-7200}
CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: ${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
CLAUDE_AUTOCOMPACT_PCT_OVERRIDE: "60"
CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: "1"
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-}
FORGE_ADMIN_PASS: ${FORGE_ADMIN_PASS:-}
DISINTO_CONTAINER: "1"
PROJECT_NAME: ${PROJECT_NAME:-project}
PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
WOODPECKER_DATA_DIR: /woodpecker-data
WOODPECKER_REPO_ID: "PLACEHOLDER_WP_REPO_ID"
CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
POLL_INTERVAL: ${POLL_INTERVAL:-300}
GARDENER_INTERVAL: ${GARDENER_INTERVAL:-21600}
ARCHITECT_INTERVAL: ${ARCHITECT_INTERVAL:-21600}
PLANNER_INTERVAL: ${PLANNER_INTERVAL:-43200}
SUPERVISOR_INTERVAL: ${SUPERVISOR_INTERVAL:-1200}
AGENT_ROLES: review,dev,gardener,architect,planner,predictor,supervisor
healthcheck:
test: ["CMD", "pgrep", "-f", "entrypoint.sh"]
interval: 60s
timeout: 5s
retries: 3
start_period: 30s
depends_on:
forgejo:
condition: service_healthy
woodpecker:
condition: service_started
networks:
- disinto-net
LLAMAEOF
fi
# Resume the rest of the compose file (runner onward) # Resume the rest of the compose file (runner onward)
cat >> "$compose_file" <<'COMPOSEEOF' cat >> "$compose_file" <<'COMPOSEEOF'
@ -633,7 +794,10 @@ COMPOSEEOF
fi fi
# Append local-model agent services if any are configured # Append local-model agent services if any are configured
_generate_local_model_services "$compose_file" if ! _generate_local_model_services "$compose_file"; then
echo "ERROR: Failed to generate local-model agent services. See errors above." >&2
return 1
fi
# Resolve the Claude CLI binary path and persist as CLAUDE_BIN_DIR in .env. # Resolve the Claude CLI binary path and persist as CLAUDE_BIN_DIR in .env.
# docker-compose.yml references ${CLAUDE_BIN_DIR} so the value must be set. # docker-compose.yml references ${CLAUDE_BIN_DIR} so the value must be set.
@ -660,8 +824,8 @@ COMPOSEEOF
# In build mode, replace image: with build: for locally-built images # In build mode, replace image: with build: for locally-built images
if [ "$use_build" = true ]; then if [ "$use_build" = true ]; then
sed -i 's|^\( agents:\)|\1|' "$compose_file" sed -i 's|^\( agents:\)|\1|' "$compose_file"
sed -i '/^ image: ghcr\.io\/disinto\/agents:/{s|image: ghcr\.io/disinto/agents:.*|build:\n context: .\n dockerfile: docker/agents/Dockerfile\n pull_policy: build|}' "$compose_file" sed -i '/^ image: ghcr\.io\/disinto\/agents:/{s|image: ghcr\.io/disinto/agents:.*|build:\n context: .\n dockerfile: docker/agents/Dockerfile|}' "$compose_file"
sed -i '/^ image: ghcr\.io\/disinto\/edge:/{s|image: ghcr\.io/disinto/edge:.*|build: ./docker/edge\n pull_policy: build|}' "$compose_file" sed -i '/^ image: ghcr\.io\/disinto\/edge:/{s|image: ghcr\.io/disinto/edge:.*|build: ./docker/edge|}' "$compose_file"
fi fi
echo "Created: ${compose_file}" echo "Created: ${compose_file}"

View file

@ -535,10 +535,7 @@ EOF
local interval="${poll_interval:-60}" local interval="${poll_interval:-60}"
echo " Writing [agents.${section_name}] to ${toml_file}..." echo " Writing [agents.${section_name}] to ${toml_file}..."
python3 -c ' python3 -c '
import sys import sys, re, pathlib
import tomlkit
import re
import pathlib
toml_path = sys.argv[1] toml_path = sys.argv[1]
section_name = sys.argv[2] section_name = sys.argv[2]
@ -551,39 +548,38 @@ poll_interval = sys.argv[7]
p = pathlib.Path(toml_path) p = pathlib.Path(toml_path)
text = p.read_text() text = p.read_text()
# Step 1: Remove any commented-out [agents.X] blocks (they cause parse issues) # Build the new section
# Match # [agents.section_name] followed by lines that are not section headers new_section = f"""
# Use negative lookahead to stop before a real section header (# [ or [) [agents.{section_name}]
commented_pattern = rf"(?:^|\n)# \[agents\.{re.escape(section_name)}\](?:\n(?!# \[|\[)[^\n]*)*" base_url = "{base_url}"
text = re.sub(commented_pattern, "", text, flags=re.DOTALL) model = "{model}"
api_key = "sk-no-key-required"
roles = ["{role}"]
forge_user = "{agent_name}"
compact_pct = 60
poll_interval = {poll_interval}
"""
# Step 2: Parse TOML with tomlkit (preserves comments and formatting) # Check if section already exists and replace it
try: pattern = rf"\[agents\.{re.escape(section_name)}\][^\[]*"
doc = tomlkit.parse(text) if re.search(pattern, text):
except Exception as e: text = re.sub(pattern, new_section.strip() + "\n", text)
print(f"Error: Invalid TOML in {toml_path}: {e}", file=sys.stderr) else:
sys.exit(1) # Remove commented-out example [agents.llama] block if present
text = re.sub(
r"\n# Local-model agents \(optional\).*?(?=\n# \[mirrors\]|\n\[mirrors\]|\Z)",
"",
text,
flags=re.DOTALL,
)
# Append before [mirrors] if it exists, otherwise at end
mirrors_match = re.search(r"\n(# )?\[mirrors\]", text)
if mirrors_match:
text = text[:mirrors_match.start()] + "\n" + new_section + text[mirrors_match.start():]
else:
text = text.rstrip() + "\n" + new_section
# Step 3: Ensure agents table exists p.write_text(text)
if "agents" not in doc:
doc.add("agents", tomlkit.table())
# Step 4: Update the specific agent section
doc["agents"][section_name] = {
"base_url": base_url,
"model": model,
"api_key": "sk-no-key-required",
"roles": [role],
"forge_user": agent_name,
"compact_pct": 60,
"poll_interval": int(poll_interval),
}
# Step 5: Serialize back to TOML (preserves comments)
output = tomlkit.dumps(doc)
# Step 6: Write back
p.write_text(output)
' "$toml_file" "$section_name" "$local_model" "$model" "$agent_name" "$role" "$interval" ' "$toml_file" "$section_name" "$local_model" "$model" "$agent_name" "$role" "$interval"
echo " Agent config written to TOML" echo " Agent config written to TOML"

View file

@ -38,30 +38,6 @@ _hvault_resolve_token() {
return 1 return 1
} }
# _hvault_default_env — set the local-cluster Vault env if unset
#
# Idempotent helper used by every Vault-touching script that runs during
# `disinto init` (S2). On the local-cluster common case, operators (and
# the init dispatcher in bin/disinto) have not exported VAULT_ADDR or
# VAULT_TOKEN — the server is reachable on localhost:8200 and the root
# token lives at /etc/vault.d/root.token. Scripts must Just Work in that
# shape.
#
# - If VAULT_ADDR is unset, defaults to http://127.0.0.1:8200.
# - If VAULT_TOKEN is unset, resolves from /etc/vault.d/root.token via
# _hvault_resolve_token. A missing token file is not an error here —
# downstream hvault_token_lookup() probes connectivity and emits the
# operator-facing "VAULT_ADDR + VAULT_TOKEN" diagnostic.
#
# Centralised to keep the defaulting stanza in one place — copy-pasting
# the 5-line block into each init script trips the repo-wide 5-line
# sliding-window duplicate detector (.woodpecker/detect-duplicates.py).
_hvault_default_env() {
  # Fill in the local-cluster default address only when the operator has
  # not exported one — the := expansion assigns in place.
  : "${VAULT_ADDR:=http://127.0.0.1:8200}"
  export VAULT_ADDR
  # Best-effort token resolution: a missing token file is tolerated here;
  # downstream hvault_token_lookup() emits the operator-facing diagnostic.
  if ! _hvault_resolve_token; then
    :
  fi
}
# _hvault_check_prereqs — validate VAULT_ADDR and VAULT_TOKEN are set # _hvault_check_prereqs — validate VAULT_ADDR and VAULT_TOKEN are set
# Args: caller function name # Args: caller function name
_hvault_check_prereqs() { _hvault_check_prereqs() {
@ -124,65 +100,6 @@ _hvault_request() {
# ── Public API ─────────────────────────────────────────────────────────────── # ── Public API ───────────────────────────────────────────────────────────────
# VAULT_KV_MOUNT — KV v2 mount point (default: "kv")
# Override with: export VAULT_KV_MOUNT=secret
# Used by: hvault_kv_get, hvault_kv_put, hvault_kv_list
: "${VAULT_KV_MOUNT:=kv}"
# hvault_ensure_kv_v2 MOUNT [LOG_PREFIX]
# Assert that the given KV mount is present and KV v2. If absent, enable
# it. If present as wrong type/version, exit 1. Callers must have already
# checked VAULT_ADDR / VAULT_TOKEN.
#
# DRY_RUN (env, default 0): when 1, log intent without writing.
# LOG_PREFIX (optional): label for log lines, e.g. "[vault-seed-forgejo]".
#
# Extracted here because every vault-seed-*.sh script needs this exact
# sequence, and the 5-line sliding-window dup detector flags the
# copy-paste. One place, one implementation.
hvault_ensure_kv_v2() {
  local mount="${1:?hvault_ensure_kv_v2: MOUNT required}"
  local prefix="${2:-[hvault]}"
  local dry_run="${DRY_RUN:-0}"
  local mounts_json mount_exists mount_type mount_version
  # One GET of sys/mounts drives every decision below.
  # hvault_get_or_empty prints "" on 404 and fails hard on other errors.
  mounts_json="$(hvault_get_or_empty "sys/mounts")" \
    || { printf '%s ERROR: failed to list Vault mounts\n' "$prefix" >&2; return 1; }
  # Mount keys in the sys/mounts listing carry a trailing slash ("kv/").
  mount_exists=false
  if printf '%s' "$mounts_json" | jq -e --arg m "${mount}/" '.[$m]' >/dev/null 2>&1; then
    mount_exists=true
  fi
  if [ "$mount_exists" = true ]; then
    mount_type="$(printf '%s' "$mounts_json" \
      | jq -r --arg m "${mount}/" '.[$m].type // ""')"
    # A mount without an options.version field is treated as KV v1.
    mount_version="$(printf '%s' "$mounts_json" \
      | jq -r --arg m "${mount}/" '.[$m].options.version // "1"')"
    # Wrong type or wrong KV version is a hard error: re-mounting or
    # upgrading in place could clobber existing secrets, so refuse and
    # leave the fix to the operator. Type is checked before version.
    if [ "$mount_type" != "kv" ]; then
      printf '%s ERROR: %s/ is mounted as type=%q, expected kv — refuse to re-mount\n' \
        "$prefix" "$mount" "$mount_type" >&2
      return 1
    fi
    if [ "$mount_version" != "2" ]; then
      printf '%s ERROR: %s/ is KV v%s, expected v2 — refuse to upgrade in place\n' \
        "$prefix" "$mount" "$mount_version" >&2
      return 1
    fi
    printf '%s %s/ already mounted (kv v2) — skipping enable\n' "$prefix" "$mount"
  else
    # Absent mount: either log intent (dry-run) or enable it for real.
    if [ "$dry_run" -eq 1 ]; then
      printf '%s [dry-run] would enable %s/ as kv v2\n' "$prefix" "$mount"
    else
      local payload
      # POST sys/mounts/<mount> with type=kv, options.version="2".
      payload="$(jq -n '{type:"kv",options:{version:"2"},description:"disinto shared KV v2 (S2.4)"}')"
      _hvault_request POST "sys/mounts/${mount}" "$payload" >/dev/null \
        || { printf '%s ERROR: failed to enable %s/ as kv v2\n' "$prefix" "$mount" >&2; return 1; }
      printf '%s %s/ enabled as kv v2\n' "$prefix" "$mount"
    fi
  fi
}
# hvault_kv_get PATH [KEY] # hvault_kv_get PATH [KEY]
# Read a KV v2 secret at PATH, optionally extract a single KEY. # Read a KV v2 secret at PATH, optionally extract a single KEY.
# Outputs: JSON value (full data object, or single key value) # Outputs: JSON value (full data object, or single key value)
@ -197,7 +114,7 @@ hvault_kv_get() {
_hvault_check_prereqs "hvault_kv_get" || return 1 _hvault_check_prereqs "hvault_kv_get" || return 1
local response local response
response="$(_hvault_request GET "${VAULT_KV_MOUNT}/data/${path}")" || return 1 response="$(_hvault_request GET "secret/data/${path}")" || return 1
if [ -n "$key" ]; then if [ -n "$key" ]; then
printf '%s' "$response" | jq -e -r --arg key "$key" '.data.data[$key]' 2>/dev/null || { printf '%s' "$response" | jq -e -r --arg key "$key" '.data.data[$key]' 2>/dev/null || {
@ -237,7 +154,7 @@ hvault_kv_put() {
payload="$(printf '%s' "$payload" | jq --arg k "$k" --arg v "$v" '.data[$k] = $v')" payload="$(printf '%s' "$payload" | jq --arg k "$k" --arg v "$v" '.data[$k] = $v')"
done done
_hvault_request POST "${VAULT_KV_MOUNT}/data/${path}" "$payload" >/dev/null _hvault_request POST "secret/data/${path}" "$payload" >/dev/null
} }
# hvault_kv_list PATH # hvault_kv_list PATH
@ -253,7 +170,7 @@ hvault_kv_list() {
_hvault_check_prereqs "hvault_kv_list" || return 1 _hvault_check_prereqs "hvault_kv_list" || return 1
local response local response
response="$(_hvault_request LIST "${VAULT_KV_MOUNT}/metadata/${path}")" || return 1 response="$(_hvault_request LIST "secret/metadata/${path}")" || return 1
printf '%s' "$response" | jq -e '.data.keys' 2>/dev/null || { printf '%s' "$response" | jq -e '.data.keys' 2>/dev/null || {
_hvault_err "hvault_kv_list" "failed to parse response" "path=$path" _hvault_err "hvault_kv_list" "failed to parse response" "path=$path"
@ -261,51 +178,6 @@ hvault_kv_list() {
} }
} }
# hvault_get_or_empty PATH
# GET /v1/PATH. On 200, prints the raw response body to stdout (caller
# parses with jq). On 404, prints nothing and returns 0 — caller treats
# the empty string as "resource absent, needs create". Any other HTTP
# status is a hard error: response body is logged to stderr as a
# structured JSON error and the function returns 1.
#
# Used by the sync scripts (tools/vault-apply-*.sh +
# lib/init/nomad/vault-nomad-auth.sh) to read existing policies, roles,
# auth-method listings, and per-role configs without triggering errexit
# on the expected absent-resource case. `_hvault_request` is not a
# substitute — it treats 404 as a hard error, which is correct for
# writes but wrong for "does this already exist?" checks.
#
# Subshell + EXIT trap: the RETURN trap does NOT fire on set-e abort,
# so tmpfile cleanup from a function-scoped RETURN trap would leak on
# jq/curl errors under `set -eo pipefail`. The subshell + EXIT trap
# is the reliable cleanup boundary.
hvault_get_or_empty() {
  local path="${1:-}"
  if [ -z "$path" ]; then
    _hvault_err "hvault_get_or_empty" "PATH is required" \
      "usage: hvault_get_or_empty PATH"
    return 1
  fi
  _hvault_check_prereqs "hvault_get_or_empty" || return 1
  # Subshell so the EXIT trap reliably removes the tempfile even when a
  # command inside aborts under `set -e` (a RETURN trap would not fire
  # on that path — see the header comment above).
  (
    local tmp http_code
    tmp="$(mktemp)"
    trap 'rm -f "$tmp"' EXIT
    # Response body lands in the tempfile; only the HTTP status code is
    # captured on stdout via -w.
    http_code="$(curl -sS -o "$tmp" -w '%{http_code}' \
      -H "X-Vault-Token: ${VAULT_TOKEN}" \
      "${VAULT_ADDR}/v1/${path}")" \
      || { _hvault_err "hvault_get_or_empty" "curl failed" "path=$path"; exit 1; }
    case "$http_code" in
      # Any 2xx: emit the raw body for the caller to parse with jq.
      2[0-9][0-9]) cat "$tmp" ;;
      # 404: the expected "resource absent" case — print nothing, succeed.
      404) printf '' ;;
      # Any other status is a hard error; surface the body via _hvault_err.
      *) _hvault_err "hvault_get_or_empty" "HTTP $http_code" "$(cat "$tmp")"
         exit 1 ;;
    esac
  )
}
# hvault_policy_apply NAME FILE # hvault_policy_apply NAME FILE
# Idempotent policy upsert — create or update a Vault policy. # Idempotent policy upsert — create or update a Vault policy.
hvault_policy_apply() { hvault_policy_apply() {

View file

@ -5,7 +5,7 @@
# Wires together the S0.1S0.3 building blocks into one idempotent # Wires together the S0.1S0.3 building blocks into one idempotent
# "bring up a single-node Nomad+Vault cluster" script: # "bring up a single-node Nomad+Vault cluster" script:
# #
# 1. install.sh (nomad + vault binaries + docker daemon) # 1. install.sh (nomad + vault binaries)
# 2. systemd-nomad.sh (nomad.service — unit + enable, not started) # 2. systemd-nomad.sh (nomad.service — unit + enable, not started)
# 3. systemd-vault.sh (vault.service — unit + vault.hcl + enable) # 3. systemd-vault.sh (vault.service — unit + vault.hcl + enable)
# 4. Host-volume dirs (/srv/disinto/* matching nomad/client.hcl) # 4. Host-volume dirs (/srv/disinto/* matching nomad/client.hcl)
@ -104,7 +104,7 @@ done
# ── Dry-run: print step list + exit ────────────────────────────────────────── # ── Dry-run: print step list + exit ──────────────────────────────────────────
if [ "$dry_run" = true ]; then if [ "$dry_run" = true ]; then
cat <<EOF cat <<EOF
[dry-run] Step 1/9: install nomad + vault binaries + docker daemon [dry-run] Step 1/9: install nomad + vault binaries
→ sudo ${INSTALL_SH} → sudo ${INSTALL_SH}
[dry-run] Step 2/9: write + enable nomad.service (NOT started) [dry-run] Step 2/9: write + enable nomad.service (NOT started)
@ -129,7 +129,7 @@ EOF
[dry-run] Step 7/9: systemctl start vault + poll until unsealed (${VAULT_POLL_SECS}s) [dry-run] Step 7/9: systemctl start vault + poll until unsealed (${VAULT_POLL_SECS}s)
[dry-run] Step 8/9: systemctl start nomad + poll until ≥1 node ready + docker driver healthy (${NOMAD_POLL_SECS}s each) [dry-run] Step 8/9: systemctl start nomad + poll until ≥1 node ready (${NOMAD_POLL_SECS}s)
[dry-run] Step 9/9: write ${PROFILE_D_FILE} [dry-run] Step 9/9: write ${PROFILE_D_FILE}
export VAULT_ADDR=${VAULT_ADDR_DEFAULT} export VAULT_ADDR=${VAULT_ADDR_DEFAULT}
@ -210,21 +210,6 @@ nomad_ready_count() {
# so poll_until_healthy can call it as a single-arg command name. # so poll_until_healthy can call it as a single-arg command name.
nomad_has_ready_node() { [ "$(nomad_ready_count)" -ge 1 ]; } nomad_has_ready_node() { [ "$(nomad_ready_count)" -ge 1 ]; }
# nomad_docker_driver_healthy — true iff the nomad self-node reports the
# docker driver as Detected=true AND Healthy=true. Required by Step-1's
# forgejo jobspec (the first docker-driver consumer) — without this the
# node reaches "ready" while docker fingerprinting is still in flight,
# and the first `nomad job run forgejo` times out with an opaque
# "missing drivers" placement failure (#871).
nomad_docker_driver_healthy() {
  # Query the local node's fingerprint data; empty output means the
  # Nomad API is not answering yet — report unhealthy.
  local node_json
  node_json="$(NOMAD_ADDR="$NOMAD_ADDR_DEFAULT" nomad node status -self -json 2>/dev/null || true)"
  if [ -z "$node_json" ]; then
    return 1
  fi
  # Both flags default to "false" when absent; jq failures leave them
  # empty, which also fails the final comparison.
  local is_detected is_healthy
  is_detected="$(printf '%s' "$node_json" | jq -r '.Drivers.docker.Detected // false' 2>/dev/null)" || is_detected=""
  is_healthy="$(printf '%s' "$node_json" | jq -r '.Drivers.docker.Healthy // false' 2>/dev/null)" || is_healthy=""
  [ "$is_detected" = "true" ] && [ "$is_healthy" = "true" ]
}
# _die_with_service_status SVC REASON # _die_with_service_status SVC REASON
# Log + dump `systemctl status SVC` to stderr + die with REASON. Factored # Log + dump `systemctl status SVC` to stderr + die with REASON. Factored
# out so the poll helper doesn't carry three copies of the same dump. # out so the poll helper doesn't carry three copies of the same dump.
@ -258,8 +243,8 @@ poll_until_healthy() {
_die_with_service_status "$svc" "not healthy within ${timeout}s" _die_with_service_status "$svc" "not healthy within ${timeout}s"
} }
# ── Step 1/9: install.sh (nomad + vault binaries + docker daemon) ──────────── # ── Step 1/9: install.sh (nomad + vault binaries) ────────────────────────────
log "── Step 1/9: install nomad + vault binaries + docker daemon ──" log "── Step 1/9: install nomad + vault binaries ──"
"$INSTALL_SH" "$INSTALL_SH"
# ── Step 2/9: systemd-nomad.sh (unit + enable, not started) ────────────────── # ── Step 2/9: systemd-nomad.sh (unit + enable, not started) ──────────────────
@ -311,25 +296,13 @@ else
poll_until_healthy vault vault_is_unsealed "$VAULT_POLL_SECS" poll_until_healthy vault vault_is_unsealed "$VAULT_POLL_SECS"
fi fi
# ── Step 8/9: systemctl start nomad + poll until ≥1 node ready + docker up ── # ── Step 8/9: systemctl start nomad + poll until ≥1 node ready ───────────────
log "── Step 8/9: start nomad + poll until ≥1 node ready + docker driver healthy ──" log "── Step 8/9: start nomad + poll until ≥1 node ready ──"
# Three conditions gate this step: if systemctl is-active --quiet nomad && nomad_has_ready_node; then
# (a) nomad.service active log "nomad already active + ≥1 node ready — skip start"
# (b) ≥1 nomad node in "ready" state
# (c) nomad's docker task driver fingerprinted as Detected+Healthy
# (c) can lag (a)+(b) briefly because driver fingerprinting races with
# dockerd startup — polling it explicitly prevents Step-1 deploys from
# hitting "missing drivers" placement failures on a cold-booted host (#871).
if systemctl is-active --quiet nomad \
&& nomad_has_ready_node \
&& nomad_docker_driver_healthy; then
log "nomad already active + ≥1 node ready + docker driver healthy — skip start"
else else
if ! systemctl is-active --quiet nomad; then
systemctl start nomad systemctl start nomad
fi
poll_until_healthy nomad nomad_has_ready_node "$NOMAD_POLL_SECS" poll_until_healthy nomad nomad_has_ready_node "$NOMAD_POLL_SECS"
poll_until_healthy nomad nomad_docker_driver_healthy "$NOMAD_POLL_SECS"
fi fi
# ── Step 9/9: /etc/profile.d/disinto-nomad.sh ──────────────────────────────── # ── Step 9/9: /etc/profile.d/disinto-nomad.sh ────────────────────────────────

View file

@ -2,7 +2,7 @@
# ============================================================================= # =============================================================================
# lib/init/nomad/deploy.sh — Dependency-ordered Nomad job deploy + wait # lib/init/nomad/deploy.sh — Dependency-ordered Nomad job deploy + wait
# #
# Runs a list of jobspecs in order, waiting for each to reach healthy state # Runs a list of jobspecs in order, waiting for each to reach "running" state
# before starting the next. Step-1 uses it for forgejo-only; Steps 36 extend # before starting the next. Step-1 uses it for forgejo-only; Steps 36 extend
# the job list. # the job list.
# #
@ -16,24 +16,22 @@
# Environment: # Environment:
# REPO_ROOT — absolute path to repo root (defaults to parent of # REPO_ROOT — absolute path to repo root (defaults to parent of
# this script's parent directory) # this script's parent directory)
# JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 240) # JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 120)
# JOB_READY_TIMEOUT_<JOBNAME> — per-job timeout override (e.g.,
# JOB_READY_TIMEOUT_FORGEJO=300)
# #
# Exit codes: # Exit codes:
# 0 success (all jobs deployed and healthy, or dry-run completed) # 0 success (all jobs deployed and running, or dry-run completed)
# 1 failure (validation error, timeout, or nomad command failure) # 1 failure (validation error, timeout, or nomad command failure)
# #
# Idempotency: # Idempotency:
# Running twice back-to-back on a healthy cluster is a no-op. Jobs that are # Running twice back-to-back on a healthy cluster is a no-op. Jobs that are
# already healthy print "[deploy] <name> already healthy" and continue. # already running print "[deploy] <name> already running" and continue.
# ============================================================================= # =============================================================================
set -euo pipefail set -euo pipefail
# ── Configuration ──────────────────────────────────────────────────────────── # ── Configuration ────────────────────────────────────────────────────────────
SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}" REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-240}" JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-120}"
DRY_RUN=0 DRY_RUN=0
@ -63,12 +61,11 @@ if [ "${#JOBS[@]}" -eq 0 ]; then
fi fi
# ── Helper: _wait_job_running <name> <timeout> ─────────────────────────────── # ── Helper: _wait_job_running <name> <timeout> ───────────────────────────────
# Polls `nomad deployment status -json <deployment-id>` until: # Polls `nomad job status -json <name>` until:
# - Status == "successful" # - Status == "running", OR
# - Status == "failed" # - All allocations are in "running" state
# #
# On deployment failure: prints last 50 lines of stderr from allocations and exits 1. # On timeout: prints last 50 lines of stderr from all allocations and exits 1.
# On timeout: prints last 50 lines of stderr from allocations and exits 1.
# #
# This is a named, reusable helper for future init scripts. # This is a named, reusable helper for future init scripts.
_wait_job_running() { _wait_job_running() {
@ -76,72 +73,39 @@ _wait_job_running() {
local timeout="$2" local timeout="$2"
local elapsed=0 local elapsed=0
log "waiting for job '${job_name}' to become healthy (timeout: ${timeout}s)..." log "waiting for job '${job_name}' to become running (timeout: ${timeout}s)..."
# Get the latest deployment ID for this job (retry until available)
local deployment_id=""
local retry_count=0
local max_retries=12
while [ -z "$deployment_id" ] && [ "$retry_count" -lt "$max_retries" ]; do
deployment_id=$(nomad job deployments -json "$job_name" 2>/dev/null | jq -r '.[0].ID' 2>/dev/null) || deployment_id=""
if [ -z "$deployment_id" ]; then
sleep 5
retry_count=$((retry_count + 1))
fi
done
if [ -z "$deployment_id" ]; then
log "ERROR: no deployment found for job '${job_name}' after ${max_retries} attempts"
return 1
fi
log "tracking deployment '${deployment_id}'..."
while [ "$elapsed" -lt "$timeout" ]; do while [ "$elapsed" -lt "$timeout" ]; do
local deploy_status_json local status_json
deploy_status_json=$(nomad deployment status -json "$deployment_id" 2>/dev/null) || { status_json=$(nomad job status -json "$job_name" 2>/dev/null) || {
# Deployment may not exist yet — keep waiting # Job may not exist yet — keep waiting
sleep 5 sleep 5
elapsed=$((elapsed + 5)) elapsed=$((elapsed + 5))
continue continue
} }
local status local status
status=$(printf '%s' "$deploy_status_json" | jq -r '.Status' 2>/dev/null) || { status=$(printf '%s' "$status_json" | jq -r '.Status' 2>/dev/null) || {
sleep 5 sleep 5
elapsed=$((elapsed + 5)) elapsed=$((elapsed + 5))
continue continue
} }
case "$status" in case "$status" in
successful) running)
log "${job_name} healthy after ${elapsed}s" log "job '${job_name}' is now running"
return 0 return 0
;; ;;
failed) complete)
log "deployment '${deployment_id}' failed for job '${job_name}'" log "job '${job_name}' reached terminal state: ${status}"
log "showing last 50 lines of allocation logs (stderr):" return 0
;;
# Get allocation IDs from job status dead|failed)
local alloc_ids log "job '${job_name}' reached terminal state: ${status}"
alloc_ids=$(nomad job status -json "$job_name" 2>/dev/null \
| jq -r '.Allocations[]?.ID // empty' 2>/dev/null) || alloc_ids=""
if [ -n "$alloc_ids" ]; then
for alloc_id in $alloc_ids; do
log "--- Allocation ${alloc_id} logs (stderr) ---"
nomad alloc logs -stderr -short "$alloc_id" 2>/dev/null | tail -50 || true
done
fi
return 1 return 1
;; ;;
running|progressing)
log "deployment '${deployment_id}' status: ${status} (waiting for ${job_name}...)"
;;
*) *)
log "deployment '${deployment_id}' status: ${status} (waiting for ${job_name}...)" log "job '${job_name}' status: ${status} (waiting...)"
;; ;;
esac esac
@ -150,13 +114,13 @@ _wait_job_running() {
done done
# Timeout — print last 50 lines of alloc logs # Timeout — print last 50 lines of alloc logs
log "TIMEOUT: deployment '${deployment_id}' did not reach successful state within ${timeout}s" log "TIMEOUT: job '${job_name}' did not reach running state within ${timeout}s"
log "showing last 50 lines of allocation logs (stderr):" log "showing last 50 lines of allocation logs (stderr):"
# Get allocation IDs from job status # Get allocation IDs
local alloc_ids local alloc_ids
alloc_ids=$(nomad job status -json "$job_name" 2>/dev/null \ alloc_ids=$(nomad job status -json "$job_name" 2>/dev/null \
| jq -r '.Allocations[]?.ID // empty' 2>/dev/null) || alloc_ids="" | jq -r '.Evaluations[].Allocations[]?.ID // empty' 2>/dev/null) || alloc_ids=""
if [ -n "$alloc_ids" ]; then if [ -n "$alloc_ids" ]; then
for alloc_id in $alloc_ids; do for alloc_id in $alloc_ids; do
@ -176,15 +140,10 @@ for job_name in "${JOBS[@]}"; do
die "Jobspec not found: ${jobspec_path}" die "Jobspec not found: ${jobspec_path}"
fi fi
# Per-job timeout override: JOB_READY_TIMEOUT_<UPPERCASE_JOBNAME>
job_upper=$(printf '%s' "$job_name" | tr '[:lower:]' '[:upper:]')
timeout_var="JOB_READY_TIMEOUT_${job_upper}"
job_timeout="${!timeout_var:-$JOB_READY_TIMEOUT_SECS}"
if [ "$DRY_RUN" -eq 1 ]; then if [ "$DRY_RUN" -eq 1 ]; then
log "[dry-run] nomad job validate ${jobspec_path}" log "[dry-run] nomad job validate ${jobspec_path}"
log "[dry-run] nomad job run -detach ${jobspec_path}" log "[dry-run] nomad job run -detach ${jobspec_path}"
log "[dry-run] (would wait for '${job_name}' to become healthy for ${job_timeout}s)" log "[dry-run] (would wait for '${job_name}' to become running for ${JOB_READY_TIMEOUT_SECS}s)"
continue continue
fi fi
@ -196,12 +155,12 @@ for job_name in "${JOBS[@]}"; do
die "validation failed for: ${jobspec_path}" die "validation failed for: ${jobspec_path}"
fi fi
# 2. Check if already healthy (idempotency) # 2. Check if already running (idempotency)
job_status_json=$(nomad job status -json "$job_name" 2>/dev/null || true) job_status_json=$(nomad job status -json "$job_name" 2>/dev/null || true)
if [ -n "$job_status_json" ]; then if [ -n "$job_status_json" ]; then
current_status=$(printf '%s' "$job_status_json" | jq -r '.Status' 2>/dev/null || true) current_status=$(printf '%s' "$job_status_json" | jq -r '.Status' 2>/dev/null || true)
if [ "$current_status" = "running" ]; then if [ "$current_status" = "running" ]; then
log "${job_name} already healthy" log "${job_name} already running"
continue continue
fi fi
fi fi
@ -212,9 +171,9 @@ for job_name in "${JOBS[@]}"; do
die "failed to run job: ${job_name}" die "failed to run job: ${job_name}"
fi fi
# 4. Wait for healthy state # 4. Wait for running state
if ! _wait_job_running "$job_name" "$job_timeout"; then if ! _wait_job_running "$job_name" "$JOB_READY_TIMEOUT_SECS"; then
die "deployment for job '${job_name}' did not reach successful state" die "timeout waiting for job '${job_name}' to become running"
fi fi
done done

View file

@ -1,33 +1,20 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# ============================================================================= # =============================================================================
# lib/init/nomad/install.sh — Idempotent apt install of HashiCorp Nomad + Vault # lib/init/nomad/install.sh — Idempotent apt install of HashiCorp Nomad + Vault
# + Ubuntu-native Docker for Nomad's docker driver
# #
# Part of the Nomad+Vault migration. Installs the `nomad` binary (S0.2, # Part of the Nomad+Vault migration. Installs both the `nomad` binary (S0.2,
# issue #822), the `vault` binary (S0.3, issue #823), and the `docker` # issue #822) and the `vault` binary (S0.3, issue #823) from the same
# daemon (S0.2-fix, issue #871) needed by Nomad's docker task driver. # HashiCorp apt repository. Does NOT configure, start, or enable any systemd
# Nomad + Vault come from the pinned HashiCorp apt repo; docker comes from # unit — lib/init/nomad/systemd-nomad.sh and lib/init/nomad/systemd-vault.sh
# Ubuntu's default apt repo (docker.io) — matches the existing factory # own that. Does NOT wire this script into `disinto init` — S0.4 owns that.
# dev-box setup and avoids adding a second apt source with pinning.
#
# Does NOT configure, start, or enable nomad.service or vault.service —
# lib/init/nomad/systemd-nomad.sh and lib/init/nomad/systemd-vault.sh own
# those. The docker.service unit ships with the docker.io package and is
# enabled+started here directly (not a disinto-owned unit), because Nomad's
# docker driver reports Healthy=false without a running dockerd — that
# silently blocks job placement at Step 1 with a confusing "missing
# drivers" error (issue #871). Does NOT wire this script into `disinto
# init` — S0.4 owns that.
# #
# Idempotency contract: # Idempotency contract:
# - Running twice back-to-back is a no-op once all three targets are # - Running twice back-to-back is a no-op once both target versions are
# installed and the HashiCorp apt source is in place. # installed and the apt source is in place.
# - Adds the HashiCorp apt keyring only if it is absent. # - Adds the HashiCorp apt keyring only if it is absent.
# - Adds the HashiCorp apt sources list only if it is absent. # - Adds the HashiCorp apt sources list only if it is absent.
# - Skips `apt-get install` for any package whose installed version already # - Skips `apt-get install` for any package whose installed version already
# matches the pin. If all three are satisfied, exits before touching apt. # matches the pin. If both are at pin, exits before touching apt.
# - `command -v docker` is the docker install sentinel; `systemctl
# enable --now` is a no-op on an already-enabled+active unit.
# #
# Configuration: # Configuration:
# NOMAD_VERSION — pinned Nomad version (default: see below). Apt package # NOMAD_VERSION — pinned Nomad version (default: see below). Apt package
@ -98,25 +85,13 @@ else
need_pkgs+=("vault=${VAULT_VERSION}-1") need_pkgs+=("vault=${VAULT_VERSION}-1")
fi fi
# Docker isn't version-pinned (Ubuntu's docker.io tracks the distro's if [ "${#need_pkgs[@]}" -eq 0 ]; then
# ship-stable release — good enough for a dev box and avoids a second
# apt source). Sentinel is binary presence, not a semver match.
if command -v docker >/dev/null 2>&1; then
log "docker already installed"
docker_needs_install=0
else
docker_needs_install=1
fi
if [ "${#need_pkgs[@]}" -eq 0 ] && [ "$docker_needs_install" -eq 0 ]; then
log "nothing to do" log "nothing to do"
exit 0 exit 0
fi fi
# ── HashiCorp apt setup + nomad/vault install (skipped if both at pin) ─────── # ── Ensure HashiCorp apt keyring ─────────────────────────────────────────────
if [ "${#need_pkgs[@]}" -gt 0 ]; then if [ ! -f "$HASHICORP_KEYRING" ]; then
# Ensure HashiCorp apt keyring.
if [ ! -f "$HASHICORP_KEYRING" ]; then
log "adding HashiCorp apt keyring → ${HASHICORP_KEYRING}" log "adding HashiCorp apt keyring → ${HASHICORP_KEYRING}"
tmpkey="$(mktemp)" tmpkey="$(mktemp)"
trap 'rm -f "$tmpkey"' EXIT trap 'rm -f "$tmpkey"' EXIT
@ -127,61 +102,42 @@ if [ "${#need_pkgs[@]}" -gt 0 ]; then
chmod 0644 "$HASHICORP_KEYRING" chmod 0644 "$HASHICORP_KEYRING"
rm -f "$tmpkey" rm -f "$tmpkey"
trap - EXIT trap - EXIT
else else
log "HashiCorp apt keyring already present" log "HashiCorp apt keyring already present"
fi fi
# Ensure HashiCorp apt sources list. # ── Ensure HashiCorp apt sources list ────────────────────────────────────────
desired_source="deb [signed-by=${HASHICORP_KEYRING}] ${HASHICORP_REPO_URL} ${CODENAME} main" desired_source="deb [signed-by=${HASHICORP_KEYRING}] ${HASHICORP_REPO_URL} ${CODENAME} main"
if [ ! -f "$HASHICORP_SOURCES" ] \ if [ ! -f "$HASHICORP_SOURCES" ] \
|| ! grep -qxF "$desired_source" "$HASHICORP_SOURCES"; then || ! grep -qxF "$desired_source" "$HASHICORP_SOURCES"; then
log "writing HashiCorp apt sources list → ${HASHICORP_SOURCES}" log "writing HashiCorp apt sources list → ${HASHICORP_SOURCES}"
printf '%s\n' "$desired_source" > "$HASHICORP_SOURCES" printf '%s\n' "$desired_source" > "$HASHICORP_SOURCES"
apt_update_needed=1 apt_update_needed=1
else else
log "HashiCorp apt sources list already present" log "HashiCorp apt sources list already present"
apt_update_needed=0 apt_update_needed=0
fi fi
# Install the pinned versions. # ── Install the pinned versions ──────────────────────────────────────────────
if [ "$apt_update_needed" -eq 1 ]; then if [ "$apt_update_needed" -eq 1 ]; then
log "running apt-get update" log "running apt-get update"
DEBIAN_FRONTEND=noninteractive apt-get update -qq \ DEBIAN_FRONTEND=noninteractive apt-get update -qq \
|| die "apt-get update failed" || die "apt-get update failed"
fi fi
log "installing ${need_pkgs[*]}" log "installing ${need_pkgs[*]}"
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
"${need_pkgs[@]}" \ "${need_pkgs[@]}" \
|| die "apt-get install ${need_pkgs[*]} failed" || die "apt-get install ${need_pkgs[*]} failed"
# Verify pinned versions. # ── Verify ───────────────────────────────────────────────────────────────────
final_nomad="$(_installed_version nomad)" final_nomad="$(_installed_version nomad)"
if [ "$final_nomad" != "$NOMAD_VERSION" ]; then if [ "$final_nomad" != "$NOMAD_VERSION" ]; then
die "post-install check: expected nomad ${NOMAD_VERSION}, got '${final_nomad}'" die "post-install check: expected nomad ${NOMAD_VERSION}, got '${final_nomad}'"
fi fi
final_vault="$(_installed_version vault)" final_vault="$(_installed_version vault)"
if [ "$final_vault" != "$VAULT_VERSION" ]; then if [ "$final_vault" != "$VAULT_VERSION" ]; then
die "post-install check: expected vault ${VAULT_VERSION}, got '${final_vault}'" die "post-install check: expected vault ${VAULT_VERSION}, got '${final_vault}'"
fi
fi fi
# ── Install docker.io + enable+start docker.service (if missing) ───────────── log "nomad ${NOMAD_VERSION} + vault ${VAULT_VERSION} installed successfully"
# Nomad's docker task driver reports Healthy=false without a running
# dockerd. On the factory dev box docker was pre-installed so Step 0's
# cluster-up passed silently; on a fresh LXC the first docker-driver
# jobspec (forgejo, Step 1) fails placement with "missing drivers".
# Install from Ubuntu's default apt repo — no second source, no pinning.
# `docker.service` ships with the package; `enable --now` is idempotent.
if [ "$docker_needs_install" -eq 1 ]; then
log "installing docker.io"
DEBIAN_FRONTEND=noninteractive apt-get install -y -q docker.io \
|| die "apt-get install docker.io failed"
log "enabling + starting docker.service"
systemctl enable --now docker \
|| die "failed to enable/start docker.service"
command -v docker >/dev/null 2>&1 \
|| die "post-install check: docker binary still not found"
fi
log "nomad ${NOMAD_VERSION} + vault ${VAULT_VERSION} + docker installed successfully"

View file

@ -1,140 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# lib/init/nomad/vault-engines.sh — Enable required Vault secret engines
#
# Part of the Nomad+Vault migration (S2.1, issue #912). Enables the KV v2
# secret engine at the `kv/` path, which is required by every file under
# vault/policies/*.hcl, every role in vault/roles.yaml, every write done
# by tools/vault-import.sh, and every template read done by
# nomad/jobs/forgejo.hcl — all of which address paths under kv/disinto/…
# and 403 if the mount is absent.
#
# Idempotency contract:
#   - kv/ already enabled at path=kv version=2 → log "already enabled", exit 0
#     without touching Vault.
#   - kv/ enabled at a different type/version → die (manual intervention).
#     This applies to --dry-run too: a mismatched mount is an operator
#     error that must surface, not a state to report as "already enabled".
#   - kv/ not enabled → POST sys/mounts/kv to enable kv-v2, log "enabled".
#   - Second run on a fully-configured box is a silent no-op.
#
# Preconditions:
#   - Vault is unsealed and reachable (VAULT_ADDR + VAULT_TOKEN set OR
#     defaultable to the local-cluster shape via _hvault_default_env).
#   - Must run AFTER cluster-up.sh (unseal complete) but BEFORE
#     vault-apply-policies.sh (policies reference kv/* paths).
#
# Environment:
#   VAULT_ADDR  — default http://127.0.0.1:8200 via _hvault_default_env.
#   VAULT_TOKEN — env OR /etc/vault.d/root.token (resolved by lib/hvault.sh).
#
# Usage:
#   sudo lib/init/nomad/vault-engines.sh
#   sudo lib/init/nomad/vault-engines.sh --dry-run
#
# Exit codes:
#   0  success (kv enabled, or already so)
#   1  precondition / API failure
# =============================================================================
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
# shellcheck source=../../hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"

log() { printf '[vault-engines] %s\n' "$*"; }
die() { printf '[vault-engines] ERROR: %s\n' "$*" >&2; exit 1; }

# Probe the kv/ mount state and set three globals:
#   kv_present  — "yes"/"no": is any engine mounted at kv/ ?
#   kv_type     — mount type string ("" when absent)
#   kv_version  — options.version string ("" when absent; Vault returns
#                 it as the string "2" on GET, never an integer)
# Dies if sys/mounts cannot be listed. One probe shared by the dry-run and
# live paths so the two cannot drift (previously dry-run skipped the
# type/version verification the live path performs and reported any mount
# at kv/ as "already enabled").
probe_kv_mount() {
  local mounts_raw
  # sys/mounts returns an object keyed by "<path>/" for every enabled
  # secret engine (trailing slash is Vault's on-disk form). sys/mounts is
  # always present on a live Vault, so hvault_get_or_empty's 404-empty
  # path never triggers here.
  mounts_raw="$(hvault_get_or_empty "sys/mounts")" \
    || die "failed to list secret engines"
  kv_present="no"; kv_type=""; kv_version=""
  if [ -n "$mounts_raw" ] \
     && printf '%s' "$mounts_raw" | jq -e '."kv/"' >/dev/null 2>&1; then
    kv_present="yes"
    kv_type="$(printf '%s' "$mounts_raw" | jq -r '."kv/".type // ""')"
    kv_version="$(printf '%s' "$mounts_raw" | jq -r '."kv/".options.version // ""')"
  fi
}

# ── Flag parsing (single optional flag) ─────────────────────────────────────
# Shape: while/shift loop. Deliberately NOT a flat `case "${1:-}"` like
# tools/vault-apply-policies.sh nor an if/elif ladder like
# tools/vault-apply-roles.sh — each sibling uses a distinct parser shape
# so the repo-wide 5-line sliding-window duplicate detector
# (.woodpecker/detect-duplicates.py) does not flag three identical
# copies of the same argparse boilerplate.
print_help() {
  cat <<EOF
Usage: $(basename "$0") [--dry-run]

Enable the KV v2 secret engine at kv/. Required by all Vault policies,
roles, and Nomad job templates that reference kv/disinto/* paths.
Idempotent: an already-enabled kv/ is reported and left untouched.

  --dry-run   Probe state and print the action without contacting Vault
              in a way that mutates it.
EOF
}

dry_run=false
while [ "$#" -gt 0 ]; do
  case "$1" in
    --dry-run) dry_run=true; shift ;;
    -h|--help) print_help; exit 0 ;;
    *) die "unknown flag: $1" ;;
  esac
done

# ── Preconditions ────────────────────────────────────────────────────────────
for bin in curl jq; do
  command -v "$bin" >/dev/null 2>&1 \
    || die "required binary not found: ${bin}"
done
# Default the local-cluster Vault env (VAULT_ADDR + VAULT_TOKEN). Shared
# with the rest of the init-time Vault scripts — see lib/hvault.sh header.
_hvault_default_env

# ── Connectivity + state probe (shared by dry-run and live) ──────────────────
# If auth fails in dry-run, the operator gets the same diagnostic as a
# real run — no silent "would enable" against an unreachable Vault.
hvault_token_lookup >/dev/null \
  || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"

log "checking existing secret engines"
probe_kv_mount

if [ "$kv_present" = "yes" ]; then
  if [ "$kv_type" = "kv" ] && [ "$kv_version" = "2" ]; then
    if [ "$dry_run" = true ]; then
      log "[dry-run] kv-v2 at kv/ already enabled"
    else
      log "kv-v2 at kv/ already enabled (type=${kv_type}, version=${kv_version})"
    fi
    exit 0
  fi
  # Wrong type/version is fatal in BOTH modes — enabling over an existing
  # mount is impossible, and a dry-run that hides this hides real breakage.
  die "kv/ exists but is not kv-v2 (type=${kv_type:-<unset>}, version=${kv_version:-<unset>}) — manual intervention required"
fi

if [ "$dry_run" = true ]; then
  log "[dry-run] would enable kv-v2 at kv/"
  exit 0
fi

# ── Enable kv-v2 at path=kv ──────────────────────────────────────────────────
# POST sys/mounts/<path> with type=kv + options.version=2 is the
# HTTP-API equivalent of `vault secrets enable -path=kv -version=2 kv`.
# Keeps the script vault-CLI-free (matches the policy-apply + nomad-auth
# scripts; their headers explain why a CLI dep would die on client-only
# nodes).
log "enabling kv-v2 at path=kv"
enable_payload="$(jq -n '{type:"kv",options:{version:"2"}}')"
_hvault_request POST "sys/mounts/kv" "$enable_payload" >/dev/null \
  || die "failed to enable kv-v2 secret engine"
log "kv-v2 enabled at kv/"

View file

@ -1,183 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# lib/init/nomad/vault-nomad-auth.sh — Idempotent Vault JWT auth + Nomad wiring
#
# Part of the Nomad+Vault migration (S2.3, issue #881). Enables Vault's JWT
# auth method at path `jwt-nomad`, points it at Nomad's workload-identity
# JWKS endpoint, writes one role per policy (via tools/vault-apply-roles.sh),
# updates /etc/nomad.d/server.hcl with the vault stanza, and signals nomad
# to reload so jobs can exchange short-lived workload-identity tokens for
# Vault tokens — no shared VAULT_TOKEN in job env.
#
# Steps:
#   1. Enable auth method (sys/auth/jwt-nomad, type=jwt)
#   2. Configure JWKS + algs (auth/jwt-nomad/config)
#   3. Upsert roles from vault/roles.yaml (delegates to vault-apply-roles.sh)
#   4. Install /etc/nomad.d/server.hcl from repo + SIGHUP nomad if changed
#
# Idempotency contract:
#   - Auth path already enabled → skip create, log "jwt-nomad already enabled".
#   - Config identical to desired → skip write, log "jwt-nomad config unchanged".
#   - Roles: see tools/vault-apply-roles.sh header for per-role diffing.
#   - server.hcl on disk byte-identical to repo copy → skip write, skip SIGHUP.
#   - Second run on a fully-configured box is a silent no-op end-to-end.
#
# Preconditions:
#   - S0 complete (empty cluster up: nomad + vault reachable, vault unsealed).
#   - S2.1 complete: vault/policies/*.hcl applied via tools/vault-apply-policies.sh
#     (otherwise the roles we write will reference policies Vault does not
#     know about — the write succeeds, but token minting will fail later).
#   - Running as root (writes /etc/nomad.d/server.hcl + signals nomad).
#
# Environment:
#   VAULT_ADDR  — default http://127.0.0.1:8200 (matches nomad/vault.hcl).
#   VAULT_TOKEN — env OR /etc/vault.d/root.token (resolved by lib/hvault.sh).
#
# Usage:
#   sudo lib/init/nomad/vault-nomad-auth.sh
#
# Exit codes:
#   0  success (configured, or already so)
#   1  precondition / API / nomad-reload failure
# =============================================================================
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
APPLY_ROLES_SH="${REPO_ROOT}/tools/vault-apply-roles.sh"
SERVER_HCL_SRC="${REPO_ROOT}/nomad/server.hcl"
SERVER_HCL_DST="/etc/nomad.d/server.hcl"
# shellcheck source=../../hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"

# Default the local-cluster Vault env (see lib/hvault.sh::_hvault_default_env).
# Called from `disinto init` which does not export VAULT_ADDR/VAULT_TOKEN in
# the common fresh-LXC case (issue #912). Must run after hvault.sh is sourced.
_hvault_default_env

log() { printf '[vault-auth] %s\n' "$*"; }
die() { printf '[vault-auth] ERROR: %s\n' "$*" >&2; exit 1; }

# ── Preconditions ────────────────────────────────────────────────────────────
if [ "$(id -u)" -ne 0 ]; then
  die "must run as root (writes ${SERVER_HCL_DST} + signals nomad)"
fi
# curl + jq are used directly; hvault.sh's helpers are also curl-based, so
# the `vault` CLI is NOT required here — don't add it to this list, or a
# Vault-server-present / vault-CLI-absent box (e.g. a Nomad-client-only
# node) would die spuriously. systemctl is required for SIGHUPing nomad.
for bin in curl jq systemctl; do
  command -v "$bin" >/dev/null 2>&1 \
    || die "required binary not found: ${bin}"
done
[ -f "$SERVER_HCL_SRC" ] \
  || die "source config not found: ${SERVER_HCL_SRC}"
[ -x "$APPLY_ROLES_SH" ] \
  || die "companion script missing or not executable: ${APPLY_ROLES_SH}"
# Fail fast on Vault connectivity/auth before any mutation is attempted —
# steps 1-3 below all write to Vault.
hvault_token_lookup >/dev/null \
  || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"

# ── Desired config (Nomad workload-identity JWKS on localhost:4646) ──────────
# Nomad's default workload-identity signer publishes the public JWKS at
# /.well-known/jwks.json on the nomad HTTP API port (4646). Vault validates
# JWTs against it. RS256 is the signer's default algorithm. `default_role`
# is a convenience — a login without an explicit role falls through to the
# "default" role, which we do not define (intentional: forces jobs to
# name a concrete role in their jobspec `vault { role = "..." }`).
JWKS_URL="http://127.0.0.1:4646/.well-known/jwks.json"

# ── Step 1/4: enable auth method jwt-nomad ───────────────────────────────────
log "── Step 1/4: enable auth method path=jwt-nomad type=jwt ──"
# sys/auth returns an object keyed by "<path>/" for every enabled method.
# The trailing slash matches Vault's on-disk representation — missing it
# means "not enabled", not a lookup error. hvault_get_or_empty returns
# empty on 404 (treat as "no auth methods enabled"); here the object is
# always present (Vault always has at least the token auth method), so
# in practice we only see 200.
auth_list="$(hvault_get_or_empty "sys/auth")" \
  || die "failed to list auth methods"
if printf '%s' "$auth_list" | jq -e '.["jwt-nomad/"]' >/dev/null 2>&1; then
  log "auth path jwt-nomad already enabled"
else
  enable_payload="$(jq -n '{type:"jwt",description:"Nomad workload identity (S2.3)"}')"
  _hvault_request POST "sys/auth/jwt-nomad" "$enable_payload" >/dev/null \
    || die "failed to enable auth method jwt-nomad"
  log "auth path jwt-nomad enabled"
fi

# ── Step 2/4: configure auth/jwt-nomad/config ────────────────────────────────
log "── Step 2/4: configure auth/jwt-nomad/config ──"
# Desired config built with jq so the JSON shape is guaranteed valid.
desired_cfg="$(jq -n --arg jwks "$JWKS_URL" '{
  jwks_url: $jwks,
  jwt_supported_algs: ["RS256"],
  default_role: "default"
}')"
current_cfg_raw="$(hvault_get_or_empty "auth/jwt-nomad/config")" \
  || die "failed to read current jwt-nomad config"
if [ -n "$current_cfg_raw" ]; then
  cur_jwks="$(printf '%s' "$current_cfg_raw" | jq -r '.data.jwks_url // ""')"
  # -cS: compact + sorted output — a canonical form for the string
  # comparison against '["RS256"]' below.
  cur_algs="$(printf '%s' "$current_cfg_raw" | jq -cS '.data.jwt_supported_algs // []')"
  cur_default="$(printf '%s' "$current_cfg_raw" | jq -r '.data.default_role // ""')"
else
  # No config yet (404-empty) — sentinel values guarantee a first write.
  cur_jwks=""; cur_algs="[]"; cur_default=""
fi
# Field-by-field drift check: write only when something differs, so a
# re-run on a configured box stays a no-op (idempotency contract above).
if [ "$cur_jwks" = "$JWKS_URL" ] \
   && [ "$cur_algs" = '["RS256"]' ] \
   && [ "$cur_default" = "default" ]; then
  log "jwt-nomad config unchanged"
else
  _hvault_request POST "auth/jwt-nomad/config" "$desired_cfg" >/dev/null \
    || die "failed to write jwt-nomad config"
  log "jwt-nomad config written"
fi

# ── Step 3/4: apply roles from vault/roles.yaml ──────────────────────────────
log "── Step 3/4: apply roles from vault/roles.yaml ──"
# Delegates to tools/vault-apply-roles.sh — one source of truth for the
# parser and per-role idempotency contract. Its header documents the
# created/updated/unchanged wiring. Runs under set -e: a role failure
# aborts this script before the server.hcl install below.
"$APPLY_ROLES_SH"

# ── Step 4/4: install server.hcl + SIGHUP nomad if changed ───────────────────
log "── Step 4/4: install ${SERVER_HCL_DST} + reload nomad if changed ──"
# cluster-up.sh (S0.4) is the normal path for installing server.hcl — but
# this script is run AFTER S0.4, so we also install here. Writing only on
# content-diff keeps re-runs a true no-op (no spurious SIGHUP). `install`
# preserves perms at 0644 root:root on every write.
needs_reload=0
if [ -f "$SERVER_HCL_DST" ] && cmp -s "$SERVER_HCL_SRC" "$SERVER_HCL_DST"; then
  log "unchanged: ${SERVER_HCL_DST}"
else
  log "writing: ${SERVER_HCL_DST}"
  install -m 0644 -o root -g root "$SERVER_HCL_SRC" "$SERVER_HCL_DST"
  needs_reload=1
fi
if [ "$needs_reload" -eq 1 ]; then
  # SIGHUP triggers Nomad's config reload (see ExecReload in
  # lib/init/nomad/systemd-nomad.sh — /bin/kill -HUP $MAINPID). Using
  # `systemctl kill -s SIGHUP` instead of `systemctl reload` sends the
  # signal even when the unit doesn't declare ExecReload (defensive —
  # future unit edits can't silently break this script).
  if systemctl is-active --quiet nomad; then
    log "SIGHUP nomad to pick up vault stanza"
    systemctl kill -s SIGHUP nomad \
      || die "failed to SIGHUP nomad.service"
  else
    # Fresh box: nomad not started yet. The updated server.hcl will be
    # picked up at first start. Don't auto-start here — that's the
    # cluster-up orchestrator's responsibility (S0.4).
    log "nomad.service not active — skipping SIGHUP (next start loads vault stanza)"
  fi
else
  log "server.hcl unchanged — nomad SIGHUP not needed"
fi
log "── done — jwt-nomad auth + config + roles + nomad vault stanza in place ──"

View file

@ -1,221 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# lib/init/nomad/wp-oauth-register.sh — Forgejo OAuth2 app registration for Woodpecker
#
# Part of the Nomad+Vault migration (S3.3, issue #936). Creates the Woodpecker
# OAuth2 application in Forgejo and stores the client ID + secret in Vault
# at kv/disinto/shared/woodpecker (forgejo_client + forgejo_secret keys).
#
# The script is idempotent — re-running after success is a no-op.
#
# Scope:
#   - Checks if OAuth2 app named 'woodpecker' already exists via GET
#     /api/v1/user/applications/oauth2
#   - If not: POST /api/v1/user/applications/oauth2 with name=woodpecker,
#     redirect_uris=["http://localhost:8000/authorize"]
#   - Writes forgejo_client + forgejo_secret to Vault KV
#
# Idempotency contract:
#   - OAuth2 app 'woodpecker' exists → skip creation, log
#     "[wp-oauth] OAuth2 app 'woodpecker' already exists"
#   - forgejo_client + forgejo_secret already in Vault → skip write, log
#     "[wp-oauth] credentials already in Vault"
#
# Preconditions:
#   - Forgejo reachable at $FORGE_URL (default: http://127.0.0.1:3000)
#   - Forgejo admin token at $FORGE_TOKEN (from Vault kv/disinto/shared/forge/token
#     or env fallback)
#   - Vault reachable + unsealed at $VAULT_ADDR
#   - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable
#     (defaulted via lib/hvault.sh::_hvault_default_env)
#
# Requires:
#   - curl, jq (checked up front); openssl (checked only on the
#     regenerate-secret path)
#
# Usage:
#   lib/init/nomad/wp-oauth-register.sh
#   lib/init/nomad/wp-oauth-register.sh --dry-run
#
# Exit codes:
#   0  success (OAuth app registered + credentials seeded, or already done)
#   1  precondition / API / Vault failure
# =============================================================================
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Three levels up: lib/init/nomad/ → repo root. Matches the sibling init
# scripts; ../.. would resolve REPO_ROOT to lib/ and the hvault.sh source
# below would fail on a correct checkout.
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
# shellcheck source=../../hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"

# Configuration — FORGE_URL and the KV mount are env-overridable; the app
# name, redirect URIs, and KV paths are fixed contract with the Woodpecker
# jobspecs that read them.
FORGE_URL="${FORGE_URL:-http://127.0.0.1:3000}"
FORGE_OAUTH_APP_NAME="woodpecker"
FORGE_REDIRECT_URIS='["http://localhost:8000/authorize"]'
KV_MOUNT="${VAULT_KV_MOUNT:-kv}"
KV_PATH="disinto/shared/woodpecker"
KV_API_PATH="${KV_MOUNT}/data/${KV_PATH}"
# KV v2 data path of the Forgejo admin token seeded at init time.
FORGE_TOKEN_KV_API_PATH="${KV_MOUNT}/data/disinto/shared/forge/token"

LOG_TAG="[wp-oauth]"
log() { printf '%s %s\n' "$LOG_TAG" "$*"; }
die() { printf '%s ERROR: %s\n' "$LOG_TAG" "$*" >&2; exit 1; }

# ── Flag parsing ─────────────────────────────────────────────────────────────
DRY_RUN="${DRY_RUN:-0}"
for arg in "$@"; do
  case "$arg" in
    --dry-run) DRY_RUN=1 ;;
    -h|--help)
      printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
      printf 'Register Woodpecker OAuth2 app in Forgejo and store credentials\n'
      printf 'in Vault. Idempotent: re-running is a no-op.\n\n'
      printf '  --dry-run  Print planned actions without writing to Vault.\n'
      exit 0
      ;;
    *) die "invalid argument: ${arg} (try --help)" ;;
  esac
done

# ── Preconditions ────────────────────────────────────────────────────────────
# curl + jq are hard requirements (every step below uses them). Checked up
# front like the sibling init scripts so failures are one clear die, not a
# confusing mid-run error under set -e.
for bin in curl jq; do
  command -v "$bin" >/dev/null 2>&1 \
    || die "required binary not found: ${bin}"
done
# Default the local-cluster Vault env (VAULT_ADDR + VAULT_TOKEN, incl. the
# /etc/vault.d/root.token fallback promised in the header) — same helper
# the sibling init scripts call right after sourcing lib/hvault.sh.
_hvault_default_env

# ── Step 1/3: Resolve Forgejo token ─────────────────────────────────────────
log "── Step 1/3: resolve Forgejo token ──"
# Env wins; otherwise fall back to the admin token seeded in Vault.
FORGE_TOKEN="${FORGE_TOKEN:-}"
if [ -z "$FORGE_TOKEN" ]; then
  log "reading FORGE_TOKEN from Vault at kv/disinto/shared/forge/token"
  token_raw="$(hvault_get_or_empty "${FORGE_TOKEN_KV_API_PATH}")" \
    || die "failed to read forge token from Vault"
  if [ -n "$token_raw" ]; then
    FORGE_TOKEN="$(printf '%s' "$token_raw" | jq -r '.data.data.token // empty')"
    if [ -z "$FORGE_TOKEN" ]; then
      die "forge token not found at kv/disinto/shared/forge/token"
    fi
    log "forge token loaded from Vault"
  fi
fi
if [ -z "$FORGE_TOKEN" ]; then
  die "FORGE_TOKEN not set and not found in Vault"
fi

# ── Step 2/3: Check/create OAuth2 app in Forgejo ────────────────────────────
log "── Step 2/3: ensure OAuth2 app '${FORGE_OAUTH_APP_NAME}' in Forgejo ──"
log "checking for existing OAuth2 app '${FORGE_OAUTH_APP_NAME}'"
oauth_apps_raw=$(curl -sf --max-time 10 \
  -H "Authorization: token ${FORGE_TOKEN}" \
  "${FORGE_URL}/api/v1/user/applications/oauth2" 2>/dev/null) || {
  die "failed to list Forgejo OAuth2 apps"
}

oauth_app_exists=false
existing_client_id=""
forgejo_secret=""
if [ -n "$oauth_apps_raw" ]; then
  existing_client_id=$(printf '%s' "$oauth_apps_raw" \
    | jq -r --arg name "$FORGE_OAUTH_APP_NAME" \
        '.[] | select(.name == $name) | .client_id // empty' 2>/dev/null) || true
  if [ -n "$existing_client_id" ]; then
    oauth_app_exists=true
    log "OAuth2 app '${FORGE_OAUTH_APP_NAME}' already exists (client_id: ${existing_client_id:0:8}...)"
  fi
fi

if [ "$oauth_app_exists" = false ]; then
  log "creating OAuth2 app '${FORGE_OAUTH_APP_NAME}'"
  if [ "$DRY_RUN" -eq 1 ]; then
    log "[dry-run] would create OAuth2 app with redirect_uris: ${FORGE_REDIRECT_URIS}"
  else
    # client_secret is only returned by the Forgejo API at creation time —
    # this response is the one chance to capture it.
    oauth_response=$(curl -sf --max-time 10 -X POST \
      -H "Authorization: token ${FORGE_TOKEN}" \
      -H "Content-Type: application/json" \
      "${FORGE_URL}/api/v1/user/applications/oauth2" \
      -d "{\"name\":\"${FORGE_OAUTH_APP_NAME}\",\"redirect_uris\":${FORGE_REDIRECT_URIS}}" 2>/dev/null) || {
      die "failed to create OAuth2 app in Forgejo"
    }
    existing_client_id=$(printf '%s' "$oauth_response" | jq -r '.client_id // empty')
    forgejo_secret=$(printf '%s' "$oauth_response" | jq -r '.client_secret // empty')
    if [ -z "$existing_client_id" ] || [ -z "$forgejo_secret" ]; then
      die "failed to extract OAuth2 credentials from Forgejo response"
    fi
    log "OAuth2 app '${FORGE_OAUTH_APP_NAME}' registered (client_id: ${existing_client_id:0:8}...)"
  fi
else
  # App exists, but Forgejo never re-serves a client_secret. If Vault also
  # lacks it (checked in Step 3), generate a fresh value as a placeholder.
  # WARNING: this generated value is NOT pushed back to Forgejo, so OAuth
  # logins will fail until the app secret is regenerated in Forgejo and
  # re-imported — surfaced loudly in the log so the operator acts on it.
  if [ -z "${forgejo_secret:-}" ]; then
    log "WARNING: OAuth2 app exists but its secret is unavailable — generating a placeholder (regenerate in Forgejo + re-import to fix)"
    command -v openssl >/dev/null 2>&1 \
      || die "required binary not found: openssl"
    forgejo_secret="$(openssl rand -hex 32)"
  fi
fi

# ── Step 3/3: Write credentials to Vault ────────────────────────────────────
log "── Step 3/3: write credentials to Vault ──"
# Read the existing KV entry first so unrelated keys at this path are
# merged into the write, not clobbered (KV v2 POST replaces the whole doc).
existing_raw="$(hvault_get_or_empty "${KV_API_PATH}")" \
  || die "failed to read ${KV_API_PATH}"
existing_data="{}"
existing_client_id_in_vault=""
existing_secret_in_vault=""
if [ -n "$existing_raw" ]; then
  existing_data="$(printf '%s' "$existing_raw" | jq '.data.data // {}')"
  existing_client_id_in_vault="$(printf '%s' "$existing_raw" | jq -r '.data.data.forgejo_client // ""')"
  existing_secret_in_vault="$(printf '%s' "$existing_raw" | jq -r '.data.data.forgejo_secret // ""')"
fi

# Idempotency: Vault already holds a secret for this exact client_id →
# nothing to do. Covers the app-exists-and-secret-unretrievable case.
if [ "$existing_client_id_in_vault" = "$existing_client_id" ] && [ -n "$existing_secret_in_vault" ]; then
  log "credentials already in Vault for '${FORGE_OAUTH_APP_NAME}'"
  log "done — OAuth2 app registered + credentials in Vault"
  exit 0
fi

# Prefer a secret already stored in Vault over a locally generated
# placeholder — the stored one may still match Forgejo's.
if [ -n "$existing_secret_in_vault" ]; then
  log "using existing secret from Vault for '${FORGE_OAUTH_APP_NAME}'"
  forgejo_secret="$existing_secret_in_vault"
fi

payload="$(printf '%s' "$existing_data" \
  | jq --arg cid "$existing_client_id" \
       --arg sec "$forgejo_secret" \
       '{data: (. + {forgejo_client: $cid, forgejo_secret: $sec})}')"

if [ "$DRY_RUN" -eq 1 ]; then
  log "[dry-run] would write forgejo_client + forgejo_secret to ${KV_API_PATH}"
  log "done — [dry-run] complete"
else
  _hvault_request POST "${KV_API_PATH}" "$payload" >/dev/null \
    || die "failed to write ${KV_API_PATH}"
  log "forgejo_client + forgejo_secret written to Vault"
  log "done — OAuth2 app registered + credentials in Vault"
fi

View file

@ -1,41 +1,37 @@
<!-- last-reviewed: a7a046b81a7f454ebec43bab643067bd952d50b0 --> <!-- last-reviewed: 2a7ae0b7eae5979b2c53e3bd1c4280dfdc9df785 -->
# nomad/ — Agent Instructions # nomad/ — Agent Instructions
Nomad + Vault HCL for the factory's single-node cluster. These files are Nomad + Vault HCL for the factory's single-node cluster. These files are
the source of truth that `lib/init/nomad/cluster-up.sh` copies onto a the source of truth that `lib/init/nomad/cluster-up.sh` copies onto a
factory box under `/etc/nomad.d/` and `/etc/vault.d/` at init time. factory box under `/etc/nomad.d/` and `/etc/vault.d/` at init time.
This directory covers the **Nomad+Vault migration (Steps 03)** — This directory is part of the **Nomad+Vault migration (Step 0)** —
see issues #821#937 for the step breakdown. see issues #821#825 for the step breakdown. Jobspecs land in Step 1.
## What lives here ## What lives here
| File/Dir | Deployed to | Owned by | | File | Deployed to | Owned by |
|---|---|---| |---|---|---|
| `server.hcl` | `/etc/nomad.d/server.hcl` | agent role, bind, ports, `data_dir` (S0.2) | | `server.hcl` | `/etc/nomad.d/server.hcl` | agent role, bind, ports, `data_dir` (S0.2) |
| `client.hcl` | `/etc/nomad.d/client.hcl` | Docker driver cfg + `host_volume` declarations (S0.2) | | `client.hcl` | `/etc/nomad.d/client.hcl` | Docker driver cfg + `host_volume` declarations (S0.2) |
| `vault.hcl` | `/etc/vault.d/vault.hcl` | Vault storage, listener, UI, `disable_mlock` (S0.3) | | `vault.hcl` | `/etc/vault.d/vault.hcl` | Vault storage, listener, UI, `disable_mlock` (S0.3) |
| `jobs/forgejo.hcl` | submitted via `lib/init/nomad/deploy.sh` | Forgejo job; reads creds from Vault via consul-template stanza (S2.4) |
| `jobs/woodpecker-server.hcl` | submitted via Nomad API | Woodpecker CI server; host networking, Vault KV for `WOODPECKER_AGENT_SECRET` + Forgejo OAuth creds (S3.1) |
| `jobs/woodpecker-agent.hcl` | submitted via Nomad API | Woodpecker CI agent; host networking, `docker.sock` mount, Vault KV for `WOODPECKER_AGENT_SECRET` (S3.2) |
Nomad auto-merges every `*.hcl` under `-config=/etc/nomad.d/`, so the Nomad auto-merges every `*.hcl` under `-config=/etc/nomad.d/`, so the
split between `server.hcl` and `client.hcl` is for readability, not split between `server.hcl` and `client.hcl` is for readability, not
semantics. The top-of-file header in each config documents which blocks semantics. The top-of-file header in each config documents which blocks
it owns. it owns.
## Vault ACL policies ## What does NOT live here yet
`vault/policies/` holds one `.hcl` file per Vault policy; see - **Jobspecs.** Step 0 brings up an *empty* cluster. Step 1 (and later)
[`vault/policies/AGENTS.md`](../vault/policies/AGENTS.md) for the naming adds `*.hcl` job files for forgejo, woodpecker, agents, caddy,
convention, KV path summary, and JWT-auth role bindings (S2.1/S2.3). etc. When that lands, jobspecs will live in `nomad/jobs/` and each
will get its own header comment pointing to the `host_volume` names
## Not yet implemented it consumes (`volume = "forgejo-data"`, etc. — declared in
`client.hcl`).
- **Additional jobspecs** (agents, caddy) — Woodpecker is now deployed (S3.1-S3.2); - **TLS, ACLs, gossip encryption.** Deliberately absent in Step 0 —
agents and caddy land in later steps. factory traffic stays on localhost. These land in later migration
- **TLS, ACLs, gossip encryption** — deliberately absent for now; land steps alongside multi-node support.
alongside multi-node support.
## Adding a jobspec (Step 1 and later) ## Adding a jobspec (Step 1 and later)
@ -63,8 +59,8 @@ convention, KV path summary, and JWT-auth role bindings (S2.1/S2.3).
## How CI validates these files ## How CI validates these files
`.woodpecker/nomad-validate.yml` runs on every PR that touches `nomad/` `.woodpecker/nomad-validate.yml` runs on every PR that touches `nomad/`
(including `nomad/jobs/`), `lib/init/nomad/`, `bin/disinto`, (including `nomad/jobs/`), `lib/init/nomad/`, or `bin/disinto`. Five
`vault/policies/`, or `vault/roles.yaml`. Eight fail-closed steps: fail-closed steps:
1. **`nomad config validate nomad/server.hcl nomad/client.hcl`** 1. **`nomad config validate nomad/server.hcl nomad/client.hcl`**
— parses the HCL, fails on unknown blocks, bad port ranges, invalid — parses the HCL, fails on unknown blocks, bad port ranges, invalid
@ -89,47 +85,19 @@ convention, KV path summary, and JWT-auth role bindings (S2.1/S2.3).
disables the runtime checks (CI containers don't have disables the runtime checks (CI containers don't have
`/var/lib/vault/data` or port 8200). Exit 2 (advisory warnings only, `/var/lib/vault/data` or port 8200). Exit 2 (advisory warnings only,
e.g. TLS-disabled listener) is tolerated; exit 1 blocks merge. e.g. TLS-disabled listener) is tolerated; exit 1 blocks merge.
4. **`vault policy fmt` idempotence check on every `vault/policies/*.hcl`** 4. **`shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto`**
(S2.6) — `vault policy fmt` has no `-check` flag in 1.18.5, so the
step copies each file to `/tmp`, runs `vault policy fmt` on the copy,
and diffs against the original. Any non-empty diff means the
committed file would be rewritten by `fmt` and the step fails — the
author is pointed at `vault policy fmt <file>` to heal the drift.
5. **`vault policy write`-based validation against an inline dev-mode Vault**
(S2.6) — Vault 1.18.5 has no offline `policy validate` subcommand;
the CI step starts a dev-mode server, loops `vault policy write
<basename> <file>` over each `vault/policies/*.hcl`, and aggregates
failures so one CI run surfaces every broken policy. The server is
ephemeral and torn down on step exit — no persistence, no real
secrets. Catches unknown capability names (e.g. `"frobnicate"`),
malformed `path` blocks, and other semantic errors `fmt` does not.
6. **`vault/roles.yaml` validator** (S2.6) — yamllint + a PyYAML-based
check that every role's `policy:` field matches a basename under
`vault/policies/`, and that every role entry carries all four
required fields (`name`, `policy`, `namespace`, `job_id`). Drift
between the two directories is a scheduling-time "permission denied"
in production; this step turns it into a CI failure at PR time.
7. **`shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto`**
— all init/dispatcher shell clean. `bin/disinto` has no `.sh` — all init/dispatcher shell clean. `bin/disinto` has no `.sh`
extension so the repo-wide shellcheck in `.woodpecker/ci.yml` skips extension so the repo-wide shellcheck in `.woodpecker/ci.yml` skips
it — this is the one place it gets checked. it — this is the one place it gets checked.
8. **`bats tests/disinto-init-nomad.bats`** 5. **`bats tests/disinto-init-nomad.bats`**
— exercises the dispatcher: `disinto init --backend=nomad --dry-run`, — exercises the dispatcher: `disinto init --backend=nomad --dry-run`,
`… --empty --dry-run`, and the `--backend=docker` regression guard. `… --empty --dry-run`, and the `--backend=docker` regression guard.
**Secret-scan coverage.** Policy HCL files under `vault/policies/` are
already swept by the P11 secret-scan gate
(`.woodpecker/secret-scan.yml`, #798), whose `vault/**/*` trigger path
covers everything in this directory. `nomad-validate.yml` intentionally
does NOT duplicate that gate — one scanner, one source of truth.
If a PR breaks `nomad/server.hcl` (e.g. typo in a block name), step 1 If a PR breaks `nomad/server.hcl` (e.g. typo in a block name), step 1
fails with a clear error; if it breaks a jobspec (e.g. misspells fails with a clear error; if it breaks a jobspec (e.g. misspells
`task` as `tsak`, or adds a `volume` stanza without a `source`), step `task` as `tsak`, or adds a `volume` stanza without a `source`), step
2 fails; a typo in a `path "..."` block in a vault policy fails step 5 2 fails instead. The fix makes it pass. PRs that don't touch any of
with the Vault parser's error; a `roles.yaml` entry that points at a the trigger paths skip this pipeline entirely.
policy basename that does not exist fails step 6. PRs that don't touch
any of the trigger paths skip this pipeline entirely.
## Version pinning ## Version pinning
@ -149,13 +117,5 @@ accept (or vice versa).
- `lib/init/nomad/` — installer + systemd units + cluster-up orchestrator. - `lib/init/nomad/` — installer + systemd units + cluster-up orchestrator.
- `.woodpecker/nomad-validate.yml` — this directory's CI pipeline. - `.woodpecker/nomad-validate.yml` — this directory's CI pipeline.
- `vault/policies/` — Vault ACL policy HCL files (S2.1); the
`vault-policy-fmt` / `vault-policy-validate` CI steps above enforce
their shape. See [`../vault/policies/AGENTS.md`](../vault/policies/AGENTS.md)
for the policy lifecycle, CI enforcement details, and common failure
modes.
- `vault/roles.yaml` — JWT-auth role → policy bindings (S2.3); the
`vault-roles-validate` CI step above keeps it in lockstep with the
policies directory.
- Top-of-file headers in `server.hcl` / `client.hcl` / `vault.hcl` - Top-of-file headers in `server.hcl` / `client.hcl` / `vault.hcl`
document the per-file ownership contract. document the per-file ownership contract.

View file

@ -1,11 +1,9 @@
# ============================================================================= # =============================================================================
# nomad/jobs/forgejo.hcl Forgejo git server (Nomad service job) # nomad/jobs/forgejo.hcl Forgejo git server (Nomad service job)
# #
# Part of the Nomad+Vault migration (S1.1, issue #840; S2.4, issue #882). # Part of the Nomad+Vault migration (S1.1, issue #840). First jobspec to
# First jobspec to land under nomad/jobs/ proves the docker driver + # land under nomad/jobs/ proves the docker driver + host_volume plumbing
# host_volume plumbing from Step 0 (client.hcl) by running a real factory # from Step 0 (client.hcl) by running a real factory service.
# service. S2.4 layered Vault integration on top: admin/internal secrets
# now render via workload identity + template stanza instead of inline env.
# #
# Host_volume contract: # Host_volume contract:
# This job mounts the `forgejo-data` host_volume declared in # This job mounts the `forgejo-data` host_volume declared in
@ -14,18 +12,11 @@
# references it. Keep the `source = "forgejo-data"` below in sync with the # references it. Keep the `source = "forgejo-data"` below in sync with the
# host_volume stanza in client.hcl — drift = scheduling failures. # host_volume stanza in client.hcl — drift = scheduling failures.
# #
# Vault integration (S2.4): # No Vault integration yet Step 2 (#...) templates in OAuth secrets and
# - vault { role = "service-forgejo" } at the group scope the task's # replaces the inline FORGEJO__oauth2__* bits. The env vars below are the
# workload-identity JWT is exchanged for a Vault token carrying the # subset of docker-compose.yml's forgejo service that does NOT depend on
# policy named on that role. Role + policy are defined in # secrets: DB type, public URL, install lock, registration lockdown, webhook
# vault/roles.yaml + vault/policies/service-forgejo.hcl. # allow-list. OAuth app registration lands later, per-service.
# - template { destination = "secrets/forgejo.env" env = true } pulls
# FORGEJO__security__{SECRET_KEY,INTERNAL_TOKEN} out of Vault KV v2
# at kv/disinto/shared/forgejo and merges them into the task env.
# Seeded on fresh boxes by tools/vault-seed-forgejo.sh.
# - Non-secret env (DB type, ROOT_URL, ports, registration lockdown,
# webhook allow-list) stays inline below not sensitive, not worth
# round-tripping through Vault.
# #
# Not the runtime yet: docker-compose.yml is still the factory's live stack # Not the runtime yet: docker-compose.yml is still the factory's live stack
# until cutover. This file exists so CI can validate it and S1.3 can wire # until cutover. This file exists so CI can validate it and S1.3 can wire
@ -39,16 +30,6 @@ job "forgejo" {
group "forgejo" { group "forgejo" {
count = 1 count = 1
# Vault workload identity (S2.4, issue #882)
# `role = "service-forgejo"` is defined in vault/roles.yaml and
# applied by tools/vault-apply-roles.sh (S2.3). The role's bound
# claim pins nomad_job_id = "forgejo" renaming this jobspec's
# `job "forgejo"` without updating vault/roles.yaml will make token
# exchange fail at placement with a "claim mismatch" error.
vault {
role = "service-forgejo"
}
# Static :3000 matches docker-compose's published port so the rest of # Static :3000 matches docker-compose's published port so the rest of
# the factory (agents, woodpecker, caddy) keeps reaching forgejo at the # the factory (agents, woodpecker, caddy) keeps reaching forgejo at the
# same host:port during and after cutover. `to = 3000` maps the host # same host:port during and after cutover. `to = 3000` maps the host
@ -108,10 +89,9 @@ job "forgejo" {
read_only = false read_only = false
} }
# Non-secret env DB type, public URL, ports, install lock, # Mirrors the non-secret env set from docker-compose.yml's forgejo
# registration lockdown, webhook allow-list. Nothing sensitive here, # service. OAuth/secret-bearing env vars land in Step 2 via Vault
# so this stays inline. Secret-bearing env (SECRET_KEY, INTERNAL_TOKEN) # templates do NOT add them here.
# lives in the template stanza below and is merged into task env.
env { env {
FORGEJO__database__DB_TYPE = "sqlite3" FORGEJO__database__DB_TYPE = "sqlite3"
FORGEJO__server__ROOT_URL = "http://forgejo:3000/" FORGEJO__server__ROOT_URL = "http://forgejo:3000/"
@ -121,62 +101,6 @@ job "forgejo" {
FORGEJO__webhook__ALLOWED_HOST_LIST = "private" FORGEJO__webhook__ALLOWED_HOST_LIST = "private"
} }
# Vault-templated secrets env (S2.4, issue #882)
# Renders `<task-dir>/secrets/forgejo.env` (per-alloc secrets dir,
# never on disk on the host root filesystem, never in `nomad job
# inspect` output). `env = true` merges every KEY=VAL line into the
# task environment. `change_mode = "restart"` re-runs the task
# whenever a watched secret's value in Vault changes so `vault kv
# put ` alone is enough to roll new secrets; no manual
# `nomad alloc restart` required (though that also works it
# forces a re-render).
#
# Vault path: `kv/data/disinto/shared/forgejo`. The literal `/data/`
# segment is required by consul-template for KV v2 mounts without
# it the template would read from a KV v1 path that doesn't exist
# (the policy in vault/policies/service-forgejo.hcl grants
# `kv/data/disinto/shared/forgejo/*`, confirming v2).
#
# Empty-Vault fallback (`with ... else ...`): on a fresh LXC where
# the KV path is absent, consul-template's `with` short-circuits to
# the `else` branch. Emitting visible placeholders (instead of no
# env vars) means the container still boots, but with obviously-bad
# secrets that an operator will spot in `env | grep FORGEJO`
# better than forgejo silently regenerating SECRET_KEY on every
# restart and invalidating every prior session. Seed the path with
# tools/vault-seed-forgejo.sh to replace the placeholders.
#
# Placeholder values are kept short on purpose: the repo-wide
# secret-scan (.woodpecker/secret-scan.yml lib/secret-scan.sh)
# flags `TOKEN=<16+ non-space chars>` as a plaintext secret, so a
# descriptive long placeholder (e.g. "run-tools-vault-seed-...") on
# the INTERNAL_TOKEN line would fail CI on every PR that touched
# this file. "seed-me" is < 16 chars and still distinctive enough
# to surface in a `grep FORGEJO__security__` audit. The template
# comment below carries the operator-facing fix pointer.
# `error_on_missing_key = false` stops consul-template from blocking
# the alloc on template-pending when the Vault KV path exists but a
# referenced key is absent (or the path itself is absent and the
# else-branch placeholders are used). Without this, a fresh-LXC
# `disinto init --with forgejo` against an empty Vault hangs on
# template-pending until deploy.sh times out (issue #912, bug #4).
template {
destination = "secrets/forgejo.env"
env = true
change_mode = "restart"
error_on_missing_key = false
data = <<EOT
{{- with secret "kv/data/disinto/shared/forgejo" -}}
FORGEJO__security__SECRET_KEY={{ .Data.data.secret_key }}
FORGEJO__security__INTERNAL_TOKEN={{ .Data.data.internal_token }}
{{- else -}}
# WARNING: kv/disinto/shared/forgejo is empty run tools/vault-seed-forgejo.sh
FORGEJO__security__SECRET_KEY=seed-me
FORGEJO__security__INTERNAL_TOKEN=seed-me
{{- end -}}
EOT
}
# Baseline tune once we have real usage numbers under nomad. The # Baseline tune once we have real usage numbers under nomad. The
# docker-compose stack runs forgejo uncapped; these limits exist so # docker-compose stack runs forgejo uncapped; these limits exist so
# an unhealthy forgejo can't starve the rest of the node. # an unhealthy forgejo can't starve the rest of the node.

View file

@ -1,138 +0,0 @@
# =============================================================================
# nomad/jobs/woodpecker-agent.hcl Woodpecker CI agent (Nomad service job)
#
# Part of the Nomad+Vault migration (S3.2, issue #935).
# Drop-in for the current docker-compose setup with host networking +
# docker.sock mount, enabling the agent to spawn containers via the
# mounted socket.
#
# Host networking:
# Uses network_mode = "host" to match the compose setup. The Woodpecker
# server gRPC endpoint is addressed as "localhost:9000" since both
# server and agent run on the same host.
#
# Vault integration:
# - vault { role = "service-woodpecker-agent" } at the group scope the
# task's workload-identity JWT is exchanged for a Vault token carrying
# the policy named on that role. Role + policy are defined in
# vault/roles.yaml + vault/policies/service-woodpecker.hcl.
# - template stanza pulls WOODPECKER_AGENT_SECRET from Vault KV v2
# at kv/disinto/shared/woodpecker and writes it to secrets/agent.env.
# Seeded on fresh boxes by tools/vault-seed-woodpecker.sh.
# =============================================================================
job "woodpecker-agent" {
type = "service"
datacenters = ["dc1"]
group "woodpecker-agent" {
count = 1
# Vault workload identity
# `role = "service-woodpecker-agent"` is defined in vault/roles.yaml and
# applied by tools/vault-apply-roles.sh. The role's bound
# claim pins nomad_job_id = "woodpecker-agent" renaming this
# jobspec's `job "woodpecker-agent"` without updating vault/roles.yaml
# will make token exchange fail at placement with a "claim mismatch"
# error.
vault {
role = "service-woodpecker-agent"
}
# Health check port: static 3333 for Nomad service discovery. The agent
# exposes :3333/healthz for Nomad to probe.
network {
port "healthz" {
static = 3333
}
}
# Native Nomad service discovery for the health check endpoint.
service {
name = "woodpecker-agent"
port = "healthz"
provider = "nomad"
check {
type = "http"
path = "/healthz"
interval = "15s"
timeout = "3s"
}
}
# Conservative restart policy fail fast to the scheduler instead of
# spinning on a broken image/config. 3 attempts over 5m, then back off.
restart {
attempts = 3
interval = "5m"
delay = "15s"
mode = "delay"
}
task "woodpecker-agent" {
driver = "docker"
config {
image = "woodpeckerci/woodpecker-agent:v3"
network_mode = "host"
privileged = true
volumes = ["/var/run/docker.sock:/var/run/docker.sock"]
}
# Non-secret env server address, gRPC security, concurrency limit,
# and health check endpoint. Nothing sensitive here.
env {
WOODPECKER_SERVER = "localhost:9000"
WOODPECKER_GRPC_SECURE = "false"
WOODPECKER_MAX_WORKFLOWS = "1"
WOODPECKER_HEALTHCHECK_ADDR = ":3333"
}
# Vault-templated agent secret
# Renders <task-dir>/secrets/agent.env (per-alloc secrets dir,
# never on disk on the host root filesystem, never in `nomad job
# inspect` output). `env = true` merges WOODPECKER_AGENT_SECRET
# from the file into the task environment.
#
# Vault path: `kv/data/disinto/shared/woodpecker`. The literal
# `/data/` segment is required by consul-template for KV v2 mounts.
#
# Empty-Vault fallback (`with ... else ...`): on a fresh LXC where
# the KV path is absent, consul-template's `with` short-circuits to
# the `else` branch. Emitting a visible placeholder means the
# container still boots, but with an obviously-bad secret that an
# operator will spot better than the agent failing silently with
# auth errors. Seed the path with tools/vault-seed-woodpecker.sh
# to replace the placeholder.
#
# Placeholder values are kept short on purpose: the repo-wide
# secret-scan (.woodpecker/secret-scan.yml lib/secret-scan.sh)
# flags `TOKEN=<16+ non-space chars>` as a plaintext secret, so a
# descriptive long placeholder would fail CI on every PR that touched
# this file. "seed-me" is < 16 chars and still distinctive enough
# to surface in a `grep WOODPECKER` audit.
template {
destination = "secrets/agent.env"
env = true
change_mode = "restart"
error_on_missing_key = false
data = <<EOT
{{- with secret "kv/data/disinto/shared/woodpecker" -}}
WOODPECKER_AGENT_SECRET={{ .Data.data.agent_secret }}
{{- else -}}
# WARNING: kv/disinto/shared/woodpecker is empty run tools/vault-seed-woodpecker.sh
WOODPECKER_AGENT_SECRET=seed-me
{{- end -}}
EOT
}
# Baseline tune once we have real usage numbers under nomad.
# Conservative limits so an unhealthy agent can't starve the node.
resources {
cpu = 200
memory = 256
}
}
}
}

View file

@ -1,173 +0,0 @@
# =============================================================================
# nomad/jobs/woodpecker-server.hcl Woodpecker CI server (Nomad service job)
#
# Part of the Nomad+Vault migration (S3.1, issue #934).
# Runs the Woodpecker CI web UI + gRPC endpoint as a Nomad service job,
# reading its Forgejo OAuth + agent secret from Vault via workload identity.
#
# Host_volume contract:
# This job mounts the `woodpecker-data` host_volume declared in
# nomad/client.hcl. That volume is backed by /srv/disinto/woodpecker-data
# on the factory box, created by lib/init/nomad/cluster-up.sh before any
# job references it. Keep the `source = "woodpecker-data"` below in sync
# with the host_volume stanza in client.hcl — drift = scheduling failures.
#
# Vault integration (S2.4 pattern):
# - vault { role = "service-woodpecker" } at the group scope the task's
# workload-identity JWT is exchanged for a Vault token carrying the
# policy named on that role. Role + policy are defined in
# vault/roles.yaml + vault/policies/service-woodpecker.hcl.
# - template { destination = "secrets/wp.env" env = true } pulls
# WOODPECKER_AGENT_SECRET, WOODPECKER_FORGEJO_CLIENT, and
# WOODPECKER_FORGEJO_SECRET out of Vault KV v2 at
# kv/disinto/shared/woodpecker and merges them into the task env.
# Agent secret seeded by tools/vault-seed-woodpecker.sh; OAuth
# client/secret seeded by S3.3 (wp-oauth-register.sh).
# - Non-secret env (DB driver, Forgejo URL, host URL, open registration)
# stays inline below not sensitive, not worth round-tripping through
# Vault.
#
# Not the runtime yet: docker-compose.yml is still the factory's live stack
# until cutover. This file exists so CI can validate it and S3.4 can wire
# `disinto init --backend=nomad --with woodpecker` to `nomad job run` it.
# =============================================================================
job "woodpecker-server" {
type = "service"
datacenters = ["dc1"]
group "woodpecker-server" {
count = 1
# Vault workload identity (S2.4 pattern)
# `role = "service-woodpecker"` is defined in vault/roles.yaml and
# applied by tools/vault-apply-roles.sh (S2.3). The role's bound
# claim pins nomad_job_id = "woodpecker" note the job_id in
# vault/roles.yaml is "woodpecker" (matching the roles.yaml entry),
# but the actual Nomad job name here is "woodpecker-server". Update
# vault/roles.yaml job_id to "woodpecker-server" if the bound claim
# enforces an exact match at placement.
vault {
role = "service-woodpecker"
}
# HTTP UI (:8000) + gRPC agent endpoint (:9000). Static ports match
# docker-compose's published ports so the rest of the factory keeps
# reaching woodpecker at the same host:port during and after cutover.
network {
port "http" {
static = 8000
to = 8000
}
port "grpc" {
static = 9000
to = 9000
}
}
# Host-volume mount: declared in nomad/client.hcl, path
# /srv/disinto/woodpecker-data on the factory box.
volume "woodpecker-data" {
type = "host"
source = "woodpecker-data"
read_only = false
}
# Conservative restart policy fail fast to the scheduler instead of
# spinning on a broken image/config. 3 attempts over 5m, then back off.
restart {
attempts = 3
interval = "5m"
delay = "15s"
mode = "delay"
}
# Native Nomad service discovery (no Consul in this factory cluster).
# Health check gates the service as healthy only after the HTTP API is
# up; initial_status is deliberately unset so Nomad waits for the first
# probe to pass before marking the allocation healthy on boot.
service {
name = "woodpecker"
port = "http"
provider = "nomad"
check {
type = "http"
path = "/healthz"
interval = "10s"
timeout = "3s"
}
}
task "woodpecker-server" {
driver = "docker"
config {
image = "woodpeckerci/woodpecker-server:v3"
ports = ["http", "grpc"]
}
volume_mount {
volume = "woodpecker-data"
destination = "/var/lib/woodpecker"
read_only = false
}
# Non-secret env Forgejo integration flags, public URL, DB driver.
# Nothing sensitive here, so this stays inline. Secret-bearing env
# (agent secret, OAuth client/secret) lives in the template stanza
# below and is merged into task env.
env {
WOODPECKER_FORGEJO = "true"
WOODPECKER_FORGEJO_URL = "http://forgejo:3000"
WOODPECKER_HOST = "http://woodpecker:8000"
WOODPECKER_OPEN = "true"
WOODPECKER_DATABASE_DRIVER = "sqlite3"
WOODPECKER_DATABASE_DATASOURCE = "/var/lib/woodpecker/woodpecker.sqlite"
}
# Vault-templated secrets env (S2.4 pattern)
# Renders `<task-dir>/secrets/wp.env` (per-alloc secrets dir, never on
# disk on the host root filesystem). `env = true` merges every KEY=VAL
# line into the task environment. `change_mode = "restart"` re-runs the
# task whenever a watched secret's value in Vault changes.
#
# Vault path: `kv/data/disinto/shared/woodpecker`. The literal `/data/`
# segment is required by consul-template for KV v2 mounts.
#
# Empty-Vault fallback (`with ... else ...`): on a fresh LXC where
# the KV path is absent, consul-template's `with` short-circuits to
# the `else` branch. Emitting visible placeholders means the container
# still boots, but with obviously-bad secrets. Seed the path with
# tools/vault-seed-woodpecker.sh (agent_secret) and S3.3's
# wp-oauth-register.sh (forgejo_client, forgejo_secret).
#
# Placeholder values are kept short on purpose: the repo-wide
# secret-scan flags `TOKEN=<16+ non-space chars>` as a plaintext
# secret; "seed-me" is < 16 chars and still distinctive.
template {
destination = "secrets/wp.env"
env = true
change_mode = "restart"
error_on_missing_key = false
data = <<EOT
{{- with secret "kv/data/disinto/shared/woodpecker" -}}
WOODPECKER_AGENT_SECRET={{ .Data.data.agent_secret }}
WOODPECKER_FORGEJO_CLIENT={{ .Data.data.forgejo_client }}
WOODPECKER_FORGEJO_SECRET={{ .Data.data.forgejo_secret }}
{{- else -}}
# WARNING: kv/disinto/shared/woodpecker is empty run tools/vault-seed-woodpecker.sh + S3.3
WOODPECKER_AGENT_SECRET=seed-me
WOODPECKER_FORGEJO_CLIENT=seed-me
WOODPECKER_FORGEJO_SECRET=seed-me
{{- end -}}
EOT
}
resources {
cpu = 300
memory = 512
}
}
}
}

View file

@ -51,26 +51,3 @@ advertise {
ui { ui {
enabled = true enabled = true
} }
# Vault integration (S2.3, issue #881)
# Nomad jobs exchange their short-lived workload-identity JWT (signed by
# nomad's built-in signer at /.well-known/jwks.json on :4646) for a Vault
# token carrying the policies named by the role in `vault { role = "..." }`
# of each jobspec no shared VAULT_TOKEN in job env.
#
# The JWT auth path (jwt-nomad) + per-role bindings live on the Vault
# side, written by lib/init/nomad/vault-nomad-auth.sh + tools/vault-apply-roles.sh.
# Roles are defined in vault/roles.yaml.
#
# `default_identity.aud = ["vault.io"]` matches bound_audiences on every
# role in vault/roles.yaml a drift here would silently break every job's
# Vault token exchange at placement time.
vault {
enabled = true
address = "http://127.0.0.1:8200"
default_identity {
aud = ["vault.io"]
ttl = "1h"
}
}

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a7a046b81a7f454ebec43bab643067bd952d50b0 --> <!-- last-reviewed: 2a7ae0b7eae5979b2c53e3bd1c4280dfdc9df785 -->
# Planner Agent # Planner Agent
**Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints), **Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints),

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a7a046b81a7f454ebec43bab643067bd952d50b0 --> <!-- last-reviewed: 2a7ae0b7eae5979b2c53e3bd1c4280dfdc9df785 -->
# Predictor Agent # Predictor Agent
**Role**: Abstract adversary (the "goblin"). Runs a 2-step formula **Role**: Abstract adversary (the "goblin"). Runs a 2-step formula

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a7a046b81a7f454ebec43bab643067bd952d50b0 --> <!-- last-reviewed: 2a7ae0b7eae5979b2c53e3bd1c4280dfdc9df785 -->
# Review Agent # Review Agent
**Role**: AI-powered PR review — post structured findings and formal **Role**: AI-powered PR review — post structured findings and formal

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a7a046b81a7f454ebec43bab643067bd952d50b0 --> <!-- last-reviewed: 2a7ae0b7eae5979b2c53e3bd1c4280dfdc9df785 -->
# Supervisor Agent # Supervisor Agent
**Role**: Health monitoring and auto-remediation, executed as a formula-driven **Role**: Health monitoring and auto-remediation, executed as a formula-driven
@ -24,18 +24,10 @@ Both invoke the same `supervisor-run.sh`. Sources `lib/guard.sh` and calls `chec
files for `PHASE:escalate` entries and auto-removes any whose linked issue files for `PHASE:escalate` entries and auto-removes any whose linked issue
is confirmed closed (24h grace period after closure to avoid races). Reports is confirmed closed (24h grace period after closure to avoid races). Reports
**stale crashed worktrees** (worktrees preserved after crash) — supervisor **stale crashed worktrees** (worktrees preserved after crash) — supervisor
housekeeping removes them after 24h. Collects **Woodpecker agent health** housekeeping removes them after 24h
(added #933): container `disinto-woodpecker-agent` health/running status,
gRPC error count in last 20 min, fast-failure pipeline count (<60s, last 15 min),
and overall health verdict (healthy/unhealthy). Unhealthy verdict triggers
automatic container restart + `blocked:ci_exhausted` issue recovery in
`supervisor-run.sh` before the Claude session starts.
- `formulas/run-supervisor.toml` — Execution spec: five steps (preflight review, - `formulas/run-supervisor.toml` — Execution spec: five steps (preflight review,
health-assessment, decide-actions, report, journal) with `needs` dependencies. health-assessment, decide-actions, report, journal) with `needs` dependencies.
Claude evaluates all metrics and takes actions in a single interactive session. Claude evaluates all metrics and takes actions in a single interactive session
Health-assessment now includes P2 **Woodpecker agent unhealthy** classification
(container not running, ≥3 gRPC errors/20m, or ≥3 fast-failure pipelines/15m);
decide-actions documents the pre-session auto-recovery path
- `$OPS_REPO_ROOT/knowledge/*.md` — Domain-specific remediation guides (memory, - `$OPS_REPO_ROOT/knowledge/*.md` — Domain-specific remediation guides (memory,
disk, CI, git, dev-agent, review-agent, forge) disk, CI, git, dev-agent, review-agent, forge)
@ -55,6 +47,5 @@ P3 (degraded PRs, circular deps, stale deps), P4 (housekeeping).
- Logs a WARNING message at startup indicating degraded mode - Logs a WARNING message at startup indicating degraded mode
**Lifecycle**: supervisor-run.sh (invoked by polling loop every 20min, `check_active supervisor`) **Lifecycle**: supervisor-run.sh (invoked by polling loop every 20min, `check_active supervisor`)
→ lock + memory guard → run preflight.sh (collect metrics) → **WP agent health recovery** → lock + memory guard → run preflight.sh (collect metrics) → load formula + context → run
(if unhealthy: restart container + recover ci_exhausted issues) → load formula + context → run
claude -p via agent-sdk.sh → Claude assesses health, auto-fixes, writes journal → `PHASE:done`. claude -p via agent-sdk.sh → Claude assesses health, auto-fixes, writes journal → `PHASE:done`.

View file

@ -224,108 +224,3 @@ for _vf in "${_va_root}"/*.md; do
done done
[ "$_found_vault" = false ] && echo " None" [ "$_found_vault" = false ] && echo " None"
echo "" echo ""
# ── Woodpecker Agent Health ────────────────────────────────────────────────
echo "## Woodpecker Agent Health"
# Check WP agent container health status
_wp_container="disinto-woodpecker-agent"
_wp_health_status="unknown"
_wp_health_start=""
if command -v docker &>/dev/null; then
# Get health status via docker inspect
_wp_health_status=$(docker inspect "$_wp_container" --format '{{.State.Health.Status}}' 2>/dev/null || echo "not_found")
if [ "$_wp_health_status" = "not_found" ] || [ -z "$_wp_health_status" ]; then
# Container may not exist or not have health check configured
_wp_health_status=$(docker inspect "$_wp_container" --format '{{.State.Status}}' 2>/dev/null || echo "not_found")
fi
# Get container start time for age calculation
_wp_start_time=$(docker inspect "$_wp_container" --format '{{.State.StartedAt}}' 2>/dev/null || echo "")
if [ -n "$_wp_start_time" ] && [ "$_wp_start_time" != "0001-01-01T00:00:00Z" ]; then
_wp_health_start=$(date -d "$_wp_start_time" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_start_time")
fi
fi
echo "Container: $_wp_container"
echo "Status: $_wp_health_status"
[ -n "$_wp_health_start" ] && echo "Started: $_wp_health_start"
# Check for gRPC errors in agent logs (last 20 minutes)
_wp_grpc_errors=0
if [ "$_wp_health_status" != "not_found" ] && [ -n "$_wp_health_status" ]; then
_wp_grpc_errors=$(docker logs --since 20m "$_wp_container" 2>&1 | grep -c 'grpc error' || echo "0")
echo "gRPC errors (last 20m): $_wp_grpc_errors"
fi
# Fast-failure heuristic: check for pipelines completing in <60s
_wp_fast_failures=0
_wp_recent_failures=""
if [ -n "${WOODPECKER_REPO_ID:-}" ] && [ "${WOODPECKER_REPO_ID}" != "0" ]; then
_now=$(date +%s)
_pipelines=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines?perPage=100" 2>/dev/null || echo '[]')
# Count failures with duration < 60s in last 15 minutes
_wp_fast_failures=$(echo "$_pipelines" | jq --argjson now "$_now" '
[.[] | select(.status == "failure") | select((.finished - .started) < 60) | select(($now - .finished) < 900)]
| length' 2>/dev/null || echo "0")
if [ "$_wp_fast_failures" -gt 0 ]; then
_wp_recent_failures=$(echo "$_pipelines" | jq -r --argjson now "$_now" '
[.[] | select(.status == "failure") | select((.finished - .started) < 60) | select(($now - .finished) < 900)]
| .[] | "\(.number)\t\((.finished - .started))s"' 2>/dev/null || echo "")
fi
fi
echo "Fast-fail pipelines (<60s, last 15m): $_wp_fast_failures"
if [ -n "$_wp_recent_failures" ] && [ "$_wp_fast_failures" -gt 0 ]; then
echo "Recent failures:"
echo "$_wp_recent_failures" | while IFS=$'\t' read -r _num _dur; do
echo " #$_num: ${_dur}"
done
fi
# Determine overall WP agent health
_wp_agent_healthy=true
_wp_health_reason=""
if [ "$_wp_health_status" = "not_found" ]; then
_wp_agent_healthy=false
_wp_health_reason="Container not running"
elif [ "$_wp_health_status" = "unhealthy" ]; then
_wp_agent_healthy=false
_wp_health_reason="Container health check failed"
elif [ "$_wp_health_status" != "running" ]; then
_wp_agent_healthy=false
_wp_health_reason="Container not in running state: $_wp_health_status"
elif [ "$_wp_grpc_errors" -ge 3 ]; then
_wp_agent_healthy=false
_wp_health_reason="High gRPC error count (>=3 in 20m)"
elif [ "$_wp_fast_failures" -ge 3 ]; then
_wp_agent_healthy=false
_wp_health_reason="High fast-failure count (>=3 in 15m)"
fi
echo ""
echo "WP Agent Health: $([ "$_wp_agent_healthy" = true ] && echo "healthy" || echo "UNHEALTHY")"
[ -n "$_wp_health_reason" ] && echo "Reason: $_wp_health_reason"
echo ""
# ── WP Agent Health History (for idempotency) ──────────────────────────────
echo "## WP Agent Health History"
# Track last restart timestamp to avoid duplicate restarts in same run
_WP_HEALTH_HISTORY_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health.history"
_wp_last_restart="never"
_wp_last_restart_ts=0
if [ -f "$_WP_HEALTH_HISTORY_FILE" ]; then
_wp_last_restart_ts=$(grep -m1 '^LAST_RESTART_TS=' "$_WP_HEALTH_HISTORY_FILE" 2>/dev/null | cut -d= -f2 || echo "0")
if [ -n "$_wp_last_restart_ts" ] && [ "$_wp_last_restart_ts" -gt 0 ] 2>/dev/null; then
_wp_last_restart=$(date -d "@$_wp_last_restart_ts" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_last_restart_ts")
fi
fi
echo "Last restart: $_wp_last_restart"
echo ""

View file

@ -47,9 +47,6 @@ SID_FILE="/tmp/supervisor-session-${PROJECT_NAME}.sid"
SCRATCH_FILE="/tmp/supervisor-${PROJECT_NAME}-scratch.md" SCRATCH_FILE="/tmp/supervisor-${PROJECT_NAME}-scratch.md"
WORKTREE="/tmp/${PROJECT_NAME}-supervisor-run" WORKTREE="/tmp/${PROJECT_NAME}-supervisor-run"
# WP agent container name (configurable via env var)
export WP_AGENT_CONTAINER_NAME="${WP_AGENT_CONTAINER_NAME:-disinto-woodpecker-agent}"
# Override LOG_AGENT for consistent agent identification # Override LOG_AGENT for consistent agent identification
# shellcheck disable=SC2034 # consumed by agent-sdk.sh and env.sh log() # shellcheck disable=SC2034 # consumed by agent-sdk.sh and env.sh log()
LOG_AGENT="supervisor" LOG_AGENT="supervisor"
@ -169,160 +166,6 @@ ${FORMULA_CONTENT}
${SCRATCH_INSTRUCTION} ${SCRATCH_INSTRUCTION}
${PROMPT_FOOTER}" ${PROMPT_FOOTER}"
# ── WP Agent Health Recovery ──────────────────────────────────────────────
# Check preflight output for WP agent health issues and trigger recovery if needed
_WP_HEALTH_CHECK_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health-check.md"
echo "$PREFLIGHT_OUTPUT" > "$_WP_HEALTH_CHECK_FILE"
# Extract WP agent health status from preflight output
# Note: match exact "healthy" not "UNHEALTHY" (substring issue)
_wp_agent_healthy=$(grep "^WP Agent Health: healthy$" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null && echo "true" || echo "false")
_wp_health_reason=$(grep "^Reason:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "")
if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
log "WP agent detected as UNHEALTHY: $_wp_health_reason"
# Check for idempotency guard - have we already restarted in this run?
_WP_HEALTH_HISTORY_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health.history"
_wp_last_restart_ts=0
_wp_last_restart="never"
if [ -f "$_WP_HEALTH_HISTORY_FILE" ]; then
_wp_last_restart_ts=$(grep -m1 '^LAST_RESTART_TS=' "$_WP_HEALTH_HISTORY_FILE" 2>/dev/null | cut -d= -f2 || echo "0")
if [ -n "$_wp_last_restart_ts" ] && [ "$_wp_last_restart_ts" != "0" ] 2>/dev/null; then
_wp_last_restart=$(date -d "@$_wp_last_restart_ts" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_last_restart_ts")
fi
fi
_current_ts=$(date +%s)
_restart_threshold=300 # 5 minutes between restarts
if [ -z "$_wp_last_restart_ts" ] || [ "$_wp_last_restart_ts" = "0" ] || [ $((_current_ts - _wp_last_restart_ts)) -gt $_restart_threshold ]; then
log "Triggering WP agent restart..."
# Restart the WP agent container
if docker restart "$WP_AGENT_CONTAINER_NAME" >/dev/null 2>&1; then
_restart_time=$(date -u '+%Y-%m-%d %H:%M UTC')
log "Successfully restarted WP agent container: $WP_AGENT_CONTAINER_NAME"
# Update history file
echo "LAST_RESTART_TS=$_current_ts" > "$_WP_HEALTH_HISTORY_FILE"
echo "LAST_RESTART_TIME=$_restart_time" >> "$_WP_HEALTH_HISTORY_FILE"
# Post recovery notice to journal
_journal_file="${OPS_JOURNAL_ROOT}/$(date -u +%Y-%m-%d).md"
if [ -f "$_journal_file" ]; then
{
echo ""
echo "### WP Agent Recovery - $_restart_time"
echo ""
echo "WP agent was unhealthy: $_wp_health_reason"
echo "Container restarted automatically."
} >> "$_journal_file"
fi
# Scan for issues updated in the last 30 minutes with blocked: ci_exhausted label
log "Scanning for ci_exhausted issues updated in last 30 minutes..."
_now_epoch=$(date +%s)
_thirty_min_ago=$(( _now_epoch - 1800 ))
# Fetch open issues with blocked label
_blocked_issues=$(forge_api GET "/issues?state=open&labels=blocked&type=issues&limit=100" 2>/dev/null || echo "[]")
_blocked_count=$(echo "$_blocked_issues" | jq 'length' 2>/dev/null || echo "0")
_issues_processed=0
_issues_recovered=0
if [ "$_blocked_count" -gt 0 ]; then
# Process each blocked issue
echo "$_blocked_issues" | jq -c '.[]' 2>/dev/null | while IFS= read -r issue_json; do
[ -z "$issue_json" ] && continue
_issue_num=$(echo "$issue_json" | jq -r '.number // empty')
_issue_updated=$(echo "$issue_json" | jq -r '.updated_at // empty')
_issue_labels=$(echo "$issue_json" | jq -r '.labels | map(.name) | join(",")' 2>/dev/null || echo "")
# Check if issue has ci_exhausted label
if ! echo "$_issue_labels" | grep -q "ci_exhausted"; then
continue
fi
# Parse updated_at timestamp
_issue_updated_epoch=$(date -d "$_issue_updated" +%s 2>/dev/null || echo "0")
_time_since_update=$(( _now_epoch - _issue_updated_epoch ))
# Check if updated in last 30 minutes
if [ "$_time_since_update" -lt 1800 ] && [ "$_time_since_update" -ge 0 ]; then
_issues_processed=$(( _issues_processed + 1 ))
# Check for idempotency guard - already swept by supervisor?
_issue_body=$(echo "$issue_json" | jq -r '.body // ""' 2>/dev/null || echo "")
if echo "$_issue_body" | grep -q "<!-- supervisor-swept -->"; then
log "Issue #$_issue_num already swept by supervisor, skipping"
continue
fi
log "Processing ci_exhausted issue #$_issue_num (updated $_time_since_update seconds ago)"
# Get issue assignee
_issue_assignee=$(echo "$issue_json" | jq -r '.assignee.login // empty' 2>/dev/null || echo "")
# Unassign the issue
if [ -n "$_issue_assignee" ]; then
log "Unassigning issue #$_issue_num from $_issue_assignee"
curl -sf -X PATCH \
-H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
-H "Content-Type: application/json" \
"${FORGE_API}/issues/$_issue_num" \
-d '{"assignees":[]}' >/dev/null 2>&1 || true
fi
# Remove blocked label
_blocked_label_id=$(forge_api GET "/labels" 2>/dev/null | jq -r '.[] | select(.name == "blocked") | .id' 2>/dev/null || echo "")
if [ -n "$_blocked_label_id" ]; then
log "Removing blocked label from issue #$_issue_num"
curl -sf -X DELETE \
-H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
"${FORGE_API}/issues/$_issue_num/labels/$_blocked_label_id" >/dev/null 2>&1 || true
fi
# Add comment about infra-flake recovery
_recovery_comment=$(cat <<EOF
<!-- supervisor-swept -->
**Automated Recovery — $(date -u '+%Y-%m-%d %H:%M UTC')**
CI agent was unhealthy between $_restart_time and now. The prior retry budget may have been spent on infra flake, not real failures.
**Recovery Actions:**
- Unassigned from pool and returned for fresh attempt
- CI agent container restarted
- Related pipelines will be retriggered automatically
**Next Steps:**
Please re-attempt this issue. The CI environment has been refreshed.
EOF
)
curl -sf -X POST \
-H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
-H "Content-Type: application/json" \
"${FORGE_API}/issues/$_issue_num/comments" \
-d "$(jq -n --arg body "$_recovery_comment" '{body: $body}')" >/dev/null 2>&1 || true
log "Recovered issue #$_issue_num - returned to pool"
fi
done
fi
log "WP agent restart and issue recovery complete"
else
log "ERROR: Failed to restart WP agent container"
fi
else
log "WP agent restart already performed in this run (since $_wp_last_restart), skipping"
fi
fi
# ── Run agent ───────────────────────────────────────────────────────────── # ── Run agent ─────────────────────────────────────────────────────────────
agent_run --worktree "$WORKTREE" "$PROMPT" agent_run --worktree "$WORKTREE" "$PROMPT"
log "agent_run complete" log "agent_run complete"

View file

@ -34,7 +34,7 @@ setup_file() {
[[ "$output" == *"nomad backend: default (cluster-up; jobs deferred to Step 1)"* ]] [[ "$output" == *"nomad backend: default (cluster-up; jobs deferred to Step 1)"* ]]
# All nine cluster-up dry-run steps, in order. # All nine cluster-up dry-run steps, in order.
[[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]]
[[ "$output" == *"[dry-run] Step 2/9: write + enable nomad.service (NOT started)"* ]] [[ "$output" == *"[dry-run] Step 2/9: write + enable nomad.service (NOT started)"* ]]
[[ "$output" == *"[dry-run] Step 3/9: write + enable vault.service + vault.hcl (NOT started)"* ]] [[ "$output" == *"[dry-run] Step 3/9: write + enable vault.service + vault.hcl (NOT started)"* ]]
[[ "$output" == *"[dry-run] Step 4/9: create host-volume dirs under /srv/disinto/"* ]] [[ "$output" == *"[dry-run] Step 4/9: create host-volume dirs under /srv/disinto/"* ]]
@ -57,7 +57,7 @@ setup_file() {
# of the migration will branch on $empty to gate job deployment; today # of the migration will branch on $empty to gate job deployment; today
# both modes invoke the same cluster-up dry-run. # both modes invoke the same cluster-up dry-run.
[[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]] [[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]]
[[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]]
[[ "$output" == *"Dry run complete — no changes made."* ]] [[ "$output" == *"Dry run complete — no changes made."* ]]
} }
@ -69,7 +69,7 @@ setup_file() {
# Negative assertion: the nomad dispatcher banners must be absent. # Negative assertion: the nomad dispatcher banners must be absent.
[[ "$output" != *"nomad backend:"* ]] [[ "$output" != *"nomad backend:"* ]]
[[ "$output" != *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] [[ "$output" != *"[dry-run] Step 1/9: install nomad + vault binaries"* ]]
# Positive assertion: docker-path output still appears — the existing # Positive assertion: docker-path output still appears — the existing
# docker dry-run printed "=== disinto init ===" before listing the # docker dry-run printed "=== disinto init ===" before listing the
@ -88,7 +88,7 @@ setup_file() {
run "$DISINTO_BIN" init placeholder/repo --backend nomad --dry-run run "$DISINTO_BIN" init placeholder/repo --backend nomad --dry-run
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
[[ "$output" == *"nomad backend: default"* ]] [[ "$output" == *"nomad backend: default"* ]]
[[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]]
} }
# ── Flag validation ────────────────────────────────────────────────────────── # ── Flag validation ──────────────────────────────────────────────────────────
@ -118,7 +118,7 @@ setup_file() {
run "$DISINTO_BIN" init --backend=nomad --empty --dry-run run "$DISINTO_BIN" init --backend=nomad --empty --dry-run
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
[[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]] [[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]]
[[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]]
# The bug symptom must be absent — backend was misdetected as docker # The bug symptom must be absent — backend was misdetected as docker
# when --backend=nomad got swallowed as repo_url. # when --backend=nomad got swallowed as repo_url.
[[ "$output" != *"--empty is only valid with --backend=nomad"* ]] [[ "$output" != *"--empty is only valid with --backend=nomad"* ]]
@ -128,7 +128,7 @@ setup_file() {
run "$DISINTO_BIN" init --backend nomad --dry-run run "$DISINTO_BIN" init --backend nomad --dry-run
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
[[ "$output" == *"nomad backend: default"* ]] [[ "$output" == *"nomad backend: default"* ]]
[[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]]
} }
@test "disinto init (no args) still errors with 'repo URL required'" { @test "disinto init (no args) still errors with 'repo URL required'" {
@ -155,44 +155,6 @@ setup_file() {
[[ "$output" == *"[deploy] dry-run complete"* ]] [[ "$output" == *"[deploy] dry-run complete"* ]]
} }
# S2.6 / #928 — every --with <svc> that ships tools/vault-seed-<svc>.sh
# must auto-invoke the seeder before deploy.sh runs. forgejo is the
# only service with a seeder today, so the dry-run plan must include
# its seed line when --with forgejo is set. The seed block must also
# appear BEFORE the deploy block (seeded secrets must exist before
# nomad reads the template stanza) — pinned here by scanning output
# order. Services without a seeder (e.g. unknown hypothetical future
# ones) are silently skipped by the loop convention.
@test "disinto init --backend=nomad --with forgejo --dry-run prints seed plan before deploy plan" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"Vault seed dry-run"* ]]
[[ "$output" == *"tools/vault-seed-forgejo.sh --dry-run"* ]]
# Order: seed header must appear before deploy header.
local seed_line deploy_line
seed_line=$(echo "$output" | grep -n "Vault seed dry-run" | head -1 | cut -d: -f1)
deploy_line=$(echo "$output" | grep -n "Deploy services dry-run" | head -1 | cut -d: -f1)
[ -n "$seed_line" ]
[ -n "$deploy_line" ]
[ "$seed_line" -lt "$deploy_line" ]
}
# Regression guard (PR #929 review): `sudo -n VAR=val -- cmd` is subject
# to sudoers env_reset policy and silently drops VAULT_ADDR unless it's
# in env_keep (it isn't in default configs). vault-seed-forgejo.sh
# requires VAULT_ADDR and dies at its own precondition check if unset,
# so the non-root branch MUST invoke `sudo -n -- env VAR=val cmd` so
# that `env` sets the variable in the child process regardless of
# sudoers policy. This grep-level guard catches a revert to the unsafe
# form that silently broke non-root seed runs on a fresh LXC.
@test "seed loop invokes sudo via 'env VAR=val' (bypasses sudoers env_reset)" {
run grep -F 'sudo -n -- env "VAULT_ADDR=' "$DISINTO_BIN"
[ "$status" -eq 0 ]
# Negative: no bare `sudo -n "VAR=val" --` form anywhere in the file.
run grep -F 'sudo -n "VAULT_ADDR=' "$DISINTO_BIN"
[ "$status" -ne 0 ]
}
@test "disinto init --backend=nomad --with forgejo,forgejo --dry-run handles comma-separated services" { @test "disinto init --backend=nomad --with forgejo,forgejo --dry-run handles comma-separated services" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo,forgejo --dry-run run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo,forgejo --dry-run
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
@ -215,44 +177,7 @@ setup_file() {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with unknown-service --dry-run run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with unknown-service --dry-run
[ "$status" -ne 0 ] [ "$status" -ne 0 ]
[[ "$output" == *"unknown service"* ]] [[ "$output" == *"unknown service"* ]]
[[ "$output" == *"known: forgejo, woodpecker-server, woodpecker-agent"* ]] [[ "$output" == *"known: forgejo"* ]]
}
# S3.4: woodpecker auto-expansion and forgejo auto-inclusion
@test "disinto init --backend=nomad --with woodpecker auto-expands to server+agent" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with woodpecker --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"services to deploy: forgejo,woodpecker-server,woodpecker-agent"* ]]
[[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent"* ]]
}
@test "disinto init --backend=nomad --with woodpecker auto-includes forgejo with note" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with woodpecker --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"Note: --with woodpecker implies --with forgejo"* ]]
}
@test "disinto init --backend=nomad --with forgejo,woodpecker expands woodpecker" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo,woodpecker --dry-run
[ "$status" -eq 0 ]
# Order follows input: forgejo first, then woodpecker expanded
[[ "$output" == *"services to deploy: forgejo,woodpecker-server,woodpecker-agent"* ]]
[[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent"* ]]
}
@test "disinto init --backend=nomad --with woodpecker seeds both forgejo and woodpecker" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with woodpecker --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"tools/vault-seed-forgejo.sh --dry-run"* ]]
[[ "$output" == *"tools/vault-seed-woodpecker.sh --dry-run"* ]]
}
@test "disinto init --backend=nomad --with forgejo,woodpecker deploys all three services" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo,woodpecker --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"[deploy] [dry-run] nomad job validate"*"forgejo.hcl"* ]]
[[ "$output" == *"[deploy] [dry-run] nomad job validate"*"woodpecker-server.hcl"* ]]
[[ "$output" == *"[deploy] [dry-run] nomad job validate"*"woodpecker-agent.hcl"* ]]
} }
@test "disinto init --backend=nomad --with forgejo (flag=value syntax) works" { @test "disinto init --backend=nomad --with forgejo (flag=value syntax) works" {
@ -266,122 +191,3 @@ setup_file() {
[ "$status" -ne 0 ] [ "$status" -ne 0 ]
[[ "$output" == *"--empty and --with are mutually exclusive"* ]] [[ "$output" == *"--empty and --with are mutually exclusive"* ]]
} }
# ── --import-env / --import-sops / --age-key (S2.5, #883) ────────────────────
#
# Step 2.5 wires Vault policies + JWT auth + optional KV import into
# `disinto init --backend=nomad`. The tests below exercise the flag
# grammar (who-requires-whom + who-requires-backend=nomad) and the
# dry-run plan shape (each --import-* flag prints its own path line,
# independently). A prior attempt at this issue regressed the "print
# every set flag" invariant by using if/elif — covered by the
# "--import-env --import-sops --age-key" case.
@test "disinto init --backend=nomad --import-env only is accepted" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-env /tmp/.env --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"--import-env"* ]]
[[ "$output" == *"env file: /tmp/.env"* ]]
}
@test "disinto init --backend=nomad --import-sops without --age-key errors" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-sops /tmp/.env.vault.enc --dry-run
[ "$status" -ne 0 ]
[[ "$output" == *"--import-sops requires --age-key"* ]]
}
@test "disinto init --backend=nomad --age-key without --import-sops errors" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --age-key /tmp/keys.txt --dry-run
[ "$status" -ne 0 ]
[[ "$output" == *"--age-key requires --import-sops"* ]]
}
@test "disinto init --backend=docker --import-env errors with backend requirement" {
run "$DISINTO_BIN" init placeholder/repo --backend=docker --import-env /tmp/.env
[ "$status" -ne 0 ]
[[ "$output" == *"--import-env, --import-sops, and --age-key require --backend=nomad"* ]]
}
@test "disinto init --backend=nomad --import-sops --age-key --dry-run shows import plan" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-sops /tmp/.env.vault.enc --age-key /tmp/keys.txt --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"Vault import dry-run"* ]]
[[ "$output" == *"--import-sops"* ]]
[[ "$output" == *"--age-key"* ]]
[[ "$output" == *"sops file: /tmp/.env.vault.enc"* ]]
[[ "$output" == *"age key: /tmp/keys.txt"* ]]
}
# When all three flags are set, each one must print its own path line —
# if/elif regressed this to "only one printed" in a prior attempt (#883).
@test "disinto init --backend=nomad --import-env --import-sops --age-key --dry-run shows full import plan" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-env /tmp/.env --import-sops /tmp/.env.vault.enc --age-key /tmp/keys.txt --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"Vault import dry-run"* ]]
[[ "$output" == *"env file: /tmp/.env"* ]]
[[ "$output" == *"sops file: /tmp/.env.vault.enc"* ]]
[[ "$output" == *"age key: /tmp/keys.txt"* ]]
}
@test "disinto init --backend=nomad without import flags shows skip message" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"no --import-env/--import-sops"* ]]
[[ "$output" == *"skipping"* ]]
}
@test "disinto init --backend=nomad --import-env --import-sops --age-key --with forgejo --dry-run shows all plans" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-env /tmp/.env --import-sops /tmp/.env.vault.enc --age-key /tmp/keys.txt --with forgejo --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"Vault import dry-run"* ]]
[[ "$output" == *"Vault policies dry-run"* ]]
[[ "$output" == *"Vault auth dry-run"* ]]
[[ "$output" == *"Deploy services dry-run"* ]]
}
@test "disinto init --backend=nomad --dry-run prints policies + auth plan even without --import-*" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --dry-run
[ "$status" -eq 0 ]
# Policies + auth run on every nomad path (idempotent), so the dry-run
# plan always lists them — regardless of whether --import-* is set.
[[ "$output" == *"Vault policies dry-run"* ]]
[[ "$output" == *"Vault auth dry-run"* ]]
[[ "$output" != *"Vault import dry-run"* ]]
}
# --import-env=PATH (=-form) must work alongside --import-env PATH.
@test "disinto init --backend=nomad --import-env=PATH (equals form) works" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-env=/tmp/.env --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"env file: /tmp/.env"* ]]
}
# --empty short-circuits after cluster-up: no policies, no auth, no
# import, no deploy. The dry-run plan must match that — cluster-up plan
# appears, but none of the S2.x section banners do.
@test "disinto init --backend=nomad --empty --dry-run skips policies/auth/import sections" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --empty --dry-run
[ "$status" -eq 0 ]
# Cluster-up still runs (it's what --empty brings up).
[[ "$output" == *"Cluster-up dry-run"* ]]
# Policies + auth + import must NOT appear under --empty.
[[ "$output" != *"Vault policies dry-run"* ]]
[[ "$output" != *"Vault auth dry-run"* ]]
[[ "$output" != *"Vault import dry-run"* ]]
[[ "$output" != *"no --import-env/--import-sops"* ]]
}
# --empty + any --import-* flag silently does nothing (import is skipped),
# so the CLI rejects the combination up front rather than letting it
# look like the import "succeeded".
@test "disinto init --backend=nomad --empty --import-env errors" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --empty --import-env /tmp/.env --dry-run
[ "$status" -ne 0 ]
[[ "$output" == *"--empty and --import-env/--import-sops/--age-key are mutually exclusive"* ]]
}
@test "disinto init --backend=nomad --empty --import-sops --age-key errors" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --empty --import-sops /tmp/.env.vault.enc --age-key /tmp/keys.txt --dry-run
[ "$status" -ne 0 ]
[[ "$output" == *"--empty and --import-env/--import-sops/--age-key are mutually exclusive"* ]]
}

View file

@ -1,20 +0,0 @@
{
"data": "ENC[AES256_GCM,data:SsLdIiZDVkkV1bbKeHQ8A1K/4vgXQFJF8y4J87GGwsGa13lNnPoqRaCmPAtuQr3hR5JNqARUhFp8aEusyzwi/lZLU2Reo32YjE26ObVOHf47EGmmHM/tEgh6u0fa1AmFtuqJVQzhG2eZhJmZJFgdRH36+bhdBwI1mkORmsRNtBPHHjtQJDbsgN47maDhuP4B7WvB4/TdnJ++GNMlMbyrbr0pEf2uqqOVO55cJ3I4v/Jcg8tq0clPuW1k5dNFsmFSMbbjE5N25EGrc7oEH5GVZ6I6L6p0Fzyj/MV4hKacboFHiZmBZgRQ,iv:UnXTa800G3PW4IaErkPBIZKjPHAU3LmiCvAqDdhFE/Q=,tag:kdWpHQ8fEPGFlmfVoTMskA==,type:str]",
"sops": {
"kms": null,
"gcp_kms": null,
"azure_kv": null,
"hc_vault": null,
"age": [
{
"recipient": "age1ztkm8yvdk42m2cn4dj2v9ptfknq8wpgr3ry9dpmtmlaeas6p7yyqft0ldg",
"enc": "-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBrVUlmaEdTNU1iMGg4dFA4\nNFNOSzlBc1NER1U3SHlwVFU1dm5tR1kyeldzCjZ2NXI3MjR4Zkd1RVBKNzJoQ1Jm\nQWpEZU5VMkNuYnhTTVJNc0RpTXlIZE0KLS0tIDFpQ2tlN0MzL1NuS2hKZU5JTG9B\nNWxXMzE0bGZpQkVBTnhWRXZBQlhrc1EKG76DM98cCuqIwUkbfJWHhJdYV77O9r8Q\nRJrq6jH59Gcp9W8iHg/aeShPHZFEOLg1q9azV9Wt9FjJn3SxyTmgvA==\n-----END AGE ENCRYPTED FILE-----\n"
}
],
"lastmodified": "2026-04-16T15:43:34Z",
"mac": "ENC[AES256_GCM,data:jVRr2TxSZH2paD2doIX4JwCqo5wiPYfTowpj189w1IVlS0EY/XQoqxiWbunX/LmIDdQlTPCSe/vTp1EJA0cx6vzN2xENrwsfzCP6dwDGaRlZhH3V0CVhtfHIkMTEKWrAUx5hFtiwJPkLYUUYi5aRWRxhZQM1eBeRvuGKdlwvmHA=,iv:H57a61AfVNLrlg+4aMl9mwXI5O38O5ZoRhpxe2PTTkY=,tag:2jwH1855VNYlKseTE/XtTg==,type:str]",
"pgp": null,
"unencrypted_suffix": "_unencrypted",
"version": "3.9.4"
}
}

View file

@ -1,5 +0,0 @@
# Test age key for sops
# Generated: 2026-04-16
# Public key: age1ztkm8yvdk42m2cn4dj2v9ptfknq8wpgr3ry9dpmtmlaeas6p7yyqft0ldg
AGE-SECRET-KEY-1PCQQX37MTZDGES76H9TGQN5XTG2ZZX2UUR87KR784NZ4MQ3NJ56S0Z23SF

View file

@ -1,40 +0,0 @@
# Test fixture .env file for vault-import.sh
# This file contains all expected keys for the import test
# Generic forge creds
FORGE_TOKEN=generic-forge-token
FORGE_PASS=generic-forge-pass
FORGE_ADMIN_TOKEN=generic-admin-token
# Bot tokens (review, dev, gardener, architect, planner, predictor, supervisor, vault)
FORGE_REVIEW_TOKEN=review-token
FORGE_REVIEW_PASS=review-pass
FORGE_DEV_TOKEN=dev-token
FORGE_DEV_PASS=dev-pass
FORGE_GARDENER_TOKEN=gardener-token
FORGE_GARDENER_PASS=gardener-pass
FORGE_ARCHITECT_TOKEN=architect-token
FORGE_ARCHITECT_PASS=architect-pass
FORGE_PLANNER_TOKEN=planner-token
FORGE_PLANNER_PASS=planner-pass
FORGE_PREDICTOR_TOKEN=predictor-token
FORGE_PREDICTOR_PASS=predictor-pass
FORGE_SUPERVISOR_TOKEN=supervisor-token
FORGE_SUPERVISOR_PASS=supervisor-pass
FORGE_VAULT_TOKEN=vault-token
FORGE_VAULT_PASS=vault-pass
# Llama bot
FORGE_TOKEN_LLAMA=llama-token
FORGE_PASS_LLAMA=llama-pass
# Woodpecker secrets
WOODPECKER_AGENT_SECRET=wp-agent-secret
WP_FORGEJO_CLIENT=wp-forgejo-client
WP_FORGEJO_SECRET=wp-forgejo-secret
WOODPECKER_TOKEN=wp-token
# Chat secrets
FORWARD_AUTH_SECRET=forward-auth-secret
CHAT_OAUTH_CLIENT_ID=chat-client-id
CHAT_OAUTH_CLIENT_SECRET=chat-client-secret

View file

@ -1,27 +0,0 @@
# Test fixture .env file with missing required keys
# This file is intentionally missing some keys to test error handling
# Generic forge creds - missing FORGE_ADMIN_TOKEN
FORGE_TOKEN=generic-forge-token
FORGE_PASS=generic-forge-pass
# Bot tokens - missing several roles
FORGE_REVIEW_TOKEN=review-token
FORGE_REVIEW_PASS=review-pass
FORGE_DEV_TOKEN=dev-token
FORGE_DEV_PASS=dev-pass
# Llama bot - missing (only token, no pass)
FORGE_TOKEN_LLAMA=llama-token
# FORGE_PASS_LLAMA=llama-pass
# Woodpecker secrets - missing some
WOODPECKER_AGENT_SECRET=wp-agent-secret
# WP_FORGEJO_CLIENT=wp-forgejo-client
# WP_FORGEJO_SECRET=wp-forgejo-secret
# WOODPECKER_TOKEN=wp-token
# Chat secrets - missing some
FORWARD_AUTH_SECRET=forward-auth-secret
# CHAT_OAUTH_CLIENT_ID=chat-client-id
# CHAT_OAUTH_CLIENT_SECRET=chat-client-secret

View file

@ -1,6 +0,0 @@
GITHUB_TOKEN=github-test-token-abc123
CODEBERG_TOKEN=codeberg-test-token-def456
CLAWHUB_TOKEN=clawhub-test-token-ghi789
DEPLOY_KEY=deploy-key-test-jkl012
NPM_TOKEN=npm-test-token-mno345
DOCKER_HUB_TOKEN=dockerhub-test-token-pqr678

View file

@ -62,73 +62,6 @@ EOF
[[ "$output" != *'FORGE_BOT_USER_LLAMA'* ]] [[ "$output" != *'FORGE_BOT_USER_LLAMA'* ]]
} }
@test "local-model agent service emits local image ref + build: fallback (#853)" {
# Before #853 the generator emitted `image: ghcr.io/disinto/agents:<tag>` for
# every hired agent. The ghcr image isn't publicly pullable and the running
# deployment has no credentials, so `docker compose up` failed with `denied`.
# The fix: emit the registry-less local name (matches `disinto init --build`
# and the legacy agents-llama stanza) plus a build: directive so hosts
# without a pre-built image can rebuild locally.
cat > "${FACTORY_ROOT}/projects/test.toml" <<'EOF'
name = "test"
repo = "test-owner/test-repo"
forge_url = "http://localhost:3000"
[agents.dev-qwen2]
base_url = "http://10.10.10.1:8081"
model = "qwen"
api_key = "sk-no-key-required"
roles = ["dev"]
forge_user = "dev-qwen2"
EOF
run bash -c "
set -euo pipefail
source '${ROOT}/lib/generators.sh'
_generate_local_model_services '${FACTORY_ROOT}/docker-compose.yml'
cat '${FACTORY_ROOT}/docker-compose.yml'
"
[ "$status" -eq 0 ]
# Local image ref — no ghcr prefix.
[[ "$output" == *'image: disinto/agents:${DISINTO_IMAGE_TAG:-latest}'* ]]
[[ "$output" != *'image: ghcr.io/disinto/agents'* ]]
# build: fallback so hosts without a pre-built image can rebuild.
[[ "$output" == *'dockerfile: docker/agents/Dockerfile'* ]]
}
@test "local-model agent service emits pull_policy: build so docker compose up rebuilds on source change (#887)" {
# Without pull_policy: build, `docker compose up -d --force-recreate` reuses
# the cached `disinto/agents:latest` image and silently runs stale
# docker/agents/entrypoint.sh even after the repo is updated. `pull_policy:
# build` forces a rebuild on every up; BuildKit layer cache makes unchanged
# rebuilds near-instant. The alternative was requiring every operator to
# remember `--build` on every invocation, which was the bug that prompted
# #887 (2h of debugging a fix that was merged but never reached the container).
cat > "${FACTORY_ROOT}/projects/test.toml" <<'EOF'
name = "test"
repo = "test-owner/test-repo"
forge_url = "http://localhost:3000"
[agents.dev-qwen2]
base_url = "http://10.10.10.1:8081"
model = "qwen"
api_key = "sk-no-key-required"
roles = ["dev"]
forge_user = "dev-qwen2"
EOF
run bash -c "
set -euo pipefail
source '${ROOT}/lib/generators.sh'
_generate_local_model_services '${FACTORY_ROOT}/docker-compose.yml'
cat '${FACTORY_ROOT}/docker-compose.yml'
"
[ "$status" -eq 0 ]
[[ "$output" == *'pull_policy: build'* ]]
}
@test "local-model agent service keys FORGE_BOT_USER to forge_user even when it differs from service name (#849)" { @test "local-model agent service keys FORGE_BOT_USER to forge_user even when it differs from service name (#849)" {
# Exercise the case the issue calls out: two agents in the same factory # Exercise the case the issue calls out: two agents in the same factory
# whose service names are identical (`[agents.llama]`) but whose # whose service names are identical (`[agents.llama]`) but whose

View file

@ -126,7 +126,7 @@ setup() {
@test "hvault_policy_apply creates a policy" { @test "hvault_policy_apply creates a policy" {
local pfile="${BATS_TEST_TMPDIR}/test-policy.hcl" local pfile="${BATS_TEST_TMPDIR}/test-policy.hcl"
cat > "$pfile" <<'HCL' cat > "$pfile" <<'HCL'
path "kv/data/test/*" { path "secret/data/test/*" {
capabilities = ["read"] capabilities = ["read"]
} }
HCL HCL
@ -138,12 +138,12 @@ HCL
run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \ run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
"${VAULT_ADDR}/v1/sys/policies/acl/test-reader" "${VAULT_ADDR}/v1/sys/policies/acl/test-reader"
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
echo "$output" | jq -e '.data.policy' | grep -q "kv/data/test" echo "$output" | jq -e '.data.policy' | grep -q "secret/data/test"
} }
@test "hvault_policy_apply is idempotent" { @test "hvault_policy_apply is idempotent" {
local pfile="${BATS_TEST_TMPDIR}/idem-policy.hcl" local pfile="${BATS_TEST_TMPDIR}/idem-policy.hcl"
printf 'path "kv/*" { capabilities = ["list"] }\n' > "$pfile" printf 'path "secret/*" { capabilities = ["list"] }\n' > "$pfile"
run hvault_policy_apply "idem-policy" "$pfile" run hvault_policy_apply "idem-policy" "$pfile"
[ "$status" -eq 0 ] [ "$status" -eq 0 ]

View file

@ -423,6 +423,49 @@ export CLAUDE_SHARED_DIR="$ORIG_CLAUDE_SHARED_DIR"
export CLAUDE_CONFIG_DIR="$ORIG_CLAUDE_CONFIG_DIR" export CLAUDE_CONFIG_DIR="$ORIG_CLAUDE_CONFIG_DIR"
rm -rf /tmp/smoke-claude-shared /tmp/smoke-home-claude rm -rf /tmp/smoke-claude-shared /tmp/smoke-home-claude
# ── 8. Test duplicate service name detection ──────────────────────────────
echo "=== 8/8 Testing duplicate service name detection ==="
# Clean up for duplicate test
rm -f "${FACTORY_ROOT}/projects/duplicate-test.toml"
# Create a TOML that would conflict with ENABLE_LLAMA_AGENT
cat > "${FACTORY_ROOT}/projects/duplicate-test.toml" <<'TOMLEOF'
name = "duplicate-test"
description = "Test project for duplicate service detection"
[ci]
woodpecker_repo_id = "999"
[agents.llama]
base_url = "http://localhost:8080"
model = "qwen:latest"
roles = ["dev"]
forge_user = "llama-bot"
TOMLEOF
# Run disinto init with ENABLE_LLAMA_AGENT=1
# This should fail because [agents.llama] conflicts with ENABLE_LLAMA_AGENT
export ENABLE_LLAMA_AGENT="1"
export FORGE_URL="http://localhost:3000"
export SMOKE_FORGE_URL="$FORGE_URL"
export FORGE_ADMIN_PASS="smoke-test-password-123"
export SKIP_PUSH=true
if bash "${FACTORY_ROOT}/bin/disinto" init \
"duplicate-test" \
--bare --yes \
--forge-url "$FORGE_URL" \
--repo-root "/tmp/smoke-test-repo" 2>&1 | grep -q "Duplicate service name 'agents-llama'"; then
pass "Duplicate service detection: correctly detected conflict between ENABLE_LLAMA_AGENT and [agents.llama]"
else
fail "Duplicate service detection: should have detected conflict between ENABLE_LLAMA_AGENT and [agents.llama]"
fi
# Clean up
rm -f "${FACTORY_ROOT}/projects/duplicate-test.toml"
unset ENABLE_LLAMA_AGENT
# ── Summary ────────────────────────────────────────────────────────────────── # ── Summary ──────────────────────────────────────────────────────────────────
echo "" echo ""
if [ "$FAILED" -ne 0 ]; then if [ "$FAILED" -ne 0 ]; then

View file

@ -0,0 +1,210 @@
#!/usr/bin/env bash
# tests/test-duplicate-service-detection.sh — Unit test for duplicate service detection
#
# Tests that the compose generator correctly detects duplicate service names
# between ENABLE_LLAMA_AGENT=1 and [agents.llama] TOML configuration.
#
# Scenarios:
#   1. ENABLE_LLAMA_AGENT=1 + a project TOML declaring [agents.llama] → the
#      generator must report "Duplicate service name 'agents-llama'".
#   2. ENABLE_LLAMA_AGENT=1 alone (no conflicting TOML) → no duplicate error.
#   3. Two project TOMLs both declaring [agents.llama] → must report the
#      same duplicate error.
set -euo pipefail

# Absolute path to the disinto checkout (this script lives in tests/).
DISINTO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
TEST_DIR=$(mktemp -d)
# Single-quoted so expansion is deferred until the trap fires.
trap 'rm -rf "$TEST_DIR"' EXIT

FAILED=0
fail() { printf 'FAIL: %s\n' "$*" >&2; FAILED=1; }
pass() { printf 'PASS: %s\n' "$*"; }

# run_generator OUTFILE — run the compose generator, teeing combined
# stdout+stderr into OUTFILE, and return the generator's exit status.
# The `|| rc=$?` guard keeps `set -e -o pipefail` from aborting the whole
# script when the generator fails on purpose.
run_generator() {
  local rc=0
  _generate_compose_impl 3000 false 2>&1 | tee "$1" || rc=$?
  return "$rc"
}

# check_duplicate OUTFILE GEN_OK CONTEXT — assert OUTFILE contains the
# duplicate-service error. GEN_OK is "yes" when the generator exited 0 and
# "no" when it returned non-zero; the PASS/FAIL wording differs between the
# two paths. CONTEXT names the colliding pair for the messages.
check_duplicate() {
  local outfile=$1 gen_ok=$2 context=$3
  if grep -q "Duplicate service name 'agents-llama'" "$outfile"; then
    if [ "$gen_ok" = "yes" ]; then
      pass "Duplicate detection: correctly detected conflict between $context"
    else
      pass "Duplicate detection: correctly detected conflict and returned non-zero exit code"
    fi
  else
    if [ "$gen_ok" = "yes" ]; then
      fail "Duplicate detection: should have detected conflict between $context"
    else
      fail "Duplicate detection: should have failed with duplicate error"
    fi
    cat "$outfile" >&2
  fi
}

# ── Test 1: Duplicate between ENABLE_LLAMA_AGENT and [agents.llama] ─────────
echo "=== Test 1: Duplicate between ENABLE_LLAMA_AGENT and [agents.llama] ==="

# Project TOML with an agent named "llama" — the name the legacy
# ENABLE_LLAMA_AGENT path also claims.
mkdir -p "${TEST_DIR}/projects"
cat > "${TEST_DIR}/projects/test-project.toml" <<'TOMLEOF'
name = "test-project"
description = "Test project for duplicate detection"

[ci]
woodpecker_repo_id = "123"

[agents.llama]
base_url = "http://localhost:8080"
model = "qwen:latest"
roles = ["dev"]
forge_user = "llama-bot"
TOMLEOF

# Environment the generator reads.
export FACTORY_ROOT="${TEST_DIR}"
export PROJECT_NAME="test-project"
export ENABLE_LLAMA_AGENT="1"
export FORGE_TOKEN=""
export FORGE_PASS=""
export CLAUDE_TIMEOUT="7200"
export POLL_INTERVAL="300"
export GARDENER_INTERVAL="21600"
export ARCHITECT_INTERVAL="21600"
export PLANNER_INTERVAL="43200"
export SUPERVISOR_INTERVAL="1200"

# Source the generators module so _generate_compose_impl is callable directly.
source "${DISINTO_ROOT}/lib/generators.sh"

# No pre-existing compose file: force regeneration.
rm -f "${TEST_DIR}/docker-compose.yml"
if run_generator "${TEST_DIR}/output.txt"; then
  check_duplicate "${TEST_DIR}/output.txt" yes "ENABLE_LLAMA_AGENT and [agents.llama]"
else
  check_duplicate "${TEST_DIR}/output.txt" no "ENABLE_LLAMA_AGENT and [agents.llama]"
fi

# ── Test 2: No duplicate when only ENABLE_LLAMA_AGENT is set ────────────────
echo ""
echo "=== Test 2: No duplicate when only ENABLE_LLAMA_AGENT is set ==="

# Remove the conflicting project TOML created in Test 1.
rm -rf "${TEST_DIR}/projects"
export ENABLE_LLAMA_AGENT="1"
rm -f "${TEST_DIR}/docker-compose.yml"
if run_generator "${TEST_DIR}/output2.txt"; then
  if grep -q "Duplicate" "${TEST_DIR}/output2.txt"; then
    fail "No duplicate: should not detect duplicate when only ENABLE_LLAMA_AGENT is set"
  else
    pass "No duplicate: correctly generated compose without duplicates"
  fi
else
  # A non-zero exit is acceptable here as long as it isn't a duplicate error
  # (e.g. missing inputs in the stripped-down test tree).
  if grep -q "Duplicate" "${TEST_DIR}/output2.txt"; then
    fail "No duplicate: should not detect duplicate when only ENABLE_LLAMA_AGENT is set"
  else
    pass "No duplicate: generator failed for other reason (acceptable)"
  fi
fi

# ── Test 3: Duplicate between two TOML agents with same name ────────────────
echo ""
echo "=== Test 3: Duplicate between two TOML agents with same name ==="

mkdir -p "${TEST_DIR}/projects"
cat > "${TEST_DIR}/projects/project1.toml" <<'TOMLEOF'
name = "project1"
description = "First project"

[ci]
woodpecker_repo_id = "1"

[agents.llama]
base_url = "http://localhost:8080"
model = "qwen:latest"
roles = ["dev"]
forge_user = "llama-bot1"
TOMLEOF
cat > "${TEST_DIR}/projects/project2.toml" <<'TOMLEOF'
name = "project2"
description = "Second project"

[ci]
woodpecker_repo_id = "2"

[agents.llama]
base_url = "http://localhost:8080"
model = "qwen:latest"
roles = ["dev"]
forge_user = "llama-bot2"
TOMLEOF

# The legacy env toggle must be off so only the TOML/TOML collision remains.
unset ENABLE_LLAMA_AGENT
rm -f "${TEST_DIR}/docker-compose.yml"
if run_generator "${TEST_DIR}/output3.txt"; then
  check_duplicate "${TEST_DIR}/output3.txt" yes "two [agents.llama] blocks"
else
  check_duplicate "${TEST_DIR}/output3.txt" no "two [agents.llama] blocks"
fi

# ── Summary ─────────────────────────────────────────────────────────────────
echo ""
if [ "$FAILED" -ne 0 ]; then
  echo "=== TESTS FAILED ==="
  exit 1
fi
echo "=== ALL TESTS PASSED ==="

View file

@ -1,360 +0,0 @@
#!/usr/bin/env bats
# tests/vault-import.bats — Tests for tools/vault-import.sh
#
# Runs against a dev-mode Vault server (single binary, no LXC needed).
# CI launches vault server -dev inline before running these tests.
VAULT_BIN="${VAULT_BIN:-vault}"
IMPORT_SCRIPT="${BATS_TEST_DIRNAME}/../tools/vault-import.sh"
FIXTURES_DIR="${BATS_TEST_DIRNAME}/fixtures"
setup_file() {
  # Bring up a throwaway dev-mode Vault on a random high port so concurrent
  # test runs cannot collide on the listener address.
  export VAULT_DEV_PORT
  VAULT_DEV_PORT="$(shuf -i 18200-18299 -n 1)"
  export VAULT_ADDR="http://127.0.0.1:${VAULT_DEV_PORT}"
  "$VAULT_BIN" server -dev \
    -dev-listen-address="127.0.0.1:${VAULT_DEV_PORT}" \
    -dev-root-token-id="test-root-token" \
    -dev-no-store-token \
    &>"${BATS_FILE_TMPDIR}/vault.log" &
  export VAULT_PID=$!
  export VAULT_TOKEN="test-root-token"

  # Poll the health endpoint until Vault answers, capped at 20 × 0.5s = 10s.
  local tries=0
  until curl -sf "${VAULT_ADDR}/v1/sys/health" >/dev/null 2>&1; do
    sleep 0.5
    tries=$((tries + 1))
    if [ "$tries" -ge 20 ]; then
      echo "Vault failed to start. Log:" >&2
      cat "${BATS_FILE_TMPDIR}/vault.log" >&2
      return 1
    fi
  done

  # Enable kv-v2 at path=kv (production mount per S2 migration). Dev-mode
  # vault only auto-mounts kv-v2 at secret/; tests must mirror the real
  # cluster layout so vault-import.sh writes land where we read them.
  curl -sf -H "X-Vault-Token: test-root-token" \
    -X POST -d '{"type":"kv","options":{"version":"2"}}' \
    "${VAULT_ADDR}/v1/sys/mounts/kv" >/dev/null
}
teardown_file() {
  # Stop the dev-mode Vault started by setup_file. Guard against an unset
  # PID (server never launched) and ignore already-dead processes.
  [ -z "${VAULT_PID:-}" ] && return 0
  kill "$VAULT_PID" 2>/dev/null || true
  wait "$VAULT_PID" 2>/dev/null || true
}
setup() {
# Runs before every test: load the hvault helper library (functions used by
# individual tests) and re-export the dev-server connection details
# established in setup_file.
source "${BATS_TEST_DIRNAME}/../lib/hvault.sh"
export VAULT_ADDR VAULT_TOKEN
}
# --- Security checks ---
@test "refuses to run if VAULT_ADDR is not localhost" {
  # A remote-looking VAULT_ADDR must trip the importer's safety gate before
  # any secret is read or written.
  export VAULT_ADDR="http://prod-vault.example.com:8200"
  run "$IMPORT_SCRIPT" \
    --env "$FIXTURES_DIR/dot-env-complete" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "$FIXTURES_DIR/age-keys.txt"
  [[ "$status" -ne 0 ]]
  [[ "$output" == *"Security check failed"* ]]
}
@test "refuses if age key file permissions are not 0400" {
  # An age key readable by group/others must be rejected outright.
  local loose_key="${BATS_TEST_TMPDIR}/bad-ages.txt"
  printf 'AGE-SECRET-KEY-1TEST\n' > "$loose_key"
  chmod 644 "$loose_key"
  run "$IMPORT_SCRIPT" \
    --env "$FIXTURES_DIR/dot-env-complete" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "$loose_key"
  [[ "$status" -ne 0 ]]
  [[ "$output" == *"permissions"* ]]
}
# --- Dry-run mode ─────────────────────────────────────────────────────────────
@test "--dry-run prints plan without writing to Vault" {
  run "$IMPORT_SCRIPT" \
    --env "$FIXTURES_DIR/dot-env-complete" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "$FIXTURES_DIR/age-keys.txt" \
    --dry-run
  [[ "$status" -eq 0 ]]
  # The plan must be announced in full...
  local banner
  for banner in "DRY-RUN" "Import plan" "Planned operations"; do
    [[ "$output" == *"$banner"* ]]
  done
  # ...but nothing may have been written: a GET on a target path must fail.
  run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
    "${VAULT_ADDR}/v1/kv/data/disinto/bots/review"
  [[ "$status" -ne 0 ]]
}
# --- Complete fixture import ─────────────────────────────────────────────────
@test "imports all keys from complete fixture" {
  run "$IMPORT_SCRIPT" \
    --env "$FIXTURES_DIR/dot-env-complete" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "$FIXTURES_DIR/age-keys.txt"
  [ "$status" -eq 0 ]
  # Table-driven verification. Each entry is "<vault path>:<expected...>"
  # where <expected...> is a space-separated list of substrings that must
  # appear in the stored document.
  local spec path expected
  for spec in \
    "disinto/bots/review:review-token review-pass" \
    "disinto/bots/dev-qwen:llama-token llama-pass" \
    "disinto/shared/forge:generic-forge-token generic-forge-pass generic-admin-token" \
    "disinto/shared/woodpecker:wp-agent-secret wp-forgejo-client wp-forgejo-secret wp-token" \
    "disinto/shared/chat:forward-auth-secret chat-client-id chat-client-secret"
  do
    path="${spec%%:*}"
    run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
      "${VAULT_ADDR}/v1/kv/data/${path}"
    [ "$status" -eq 0 ]
    for expected in ${spec#*:}; do
      echo "$output" | grep -q "$expected"
    done
  done
  # Runner tokens imported from the sops file land under runner/<NAME>.
  run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
    "${VAULT_ADDR}/v1/kv/data/disinto/runner/GITHUB_TOKEN"
  [ "$status" -eq 0 ]
  echo "$output" | jq -e '.data.data.value == "github-test-token-abc123"'
}
# --- Idempotency ──────────────────────────────────────────────────────────────
@test "re-run with unchanged fixtures reports all unchanged" {
  # Seed Vault once...
  run "$IMPORT_SCRIPT" \
    --env "$FIXTURES_DIR/dot-env-complete" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "$FIXTURES_DIR/age-keys.txt"
  [[ "$status" -eq 0 ]]
  # ...then re-run with identical inputs: every key must report unchanged.
  run "$IMPORT_SCRIPT" \
    --env "$FIXTURES_DIR/dot-env-complete" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "$FIXTURES_DIR/age-keys.txt"
  [[ "$status" -eq 0 ]]
  [[ "$output" == *"unchanged"* ]]
  # A full import touches well over ten keys, so require >10 such lines.
  local n_unchanged
  n_unchanged=$(echo "$output" | grep -c "unchanged" || true)
  [[ "$n_unchanged" -gt 10 ]]
}
@test "re-run with modified value reports only that key as updated" {
  # Clone the fixture and change exactly one secret value.
  local tweaked_env="${BATS_TEST_TMPDIR}/dot-env-modified"
  cp "$FIXTURES_DIR/dot-env-complete" "$tweaked_env"
  sed -i 's/llama-token/MODIFIED-LLAMA-TOKEN/' "$tweaked_env"
  run "$IMPORT_SCRIPT" \
    --env "$tweaked_env" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "$FIXTURES_DIR/age-keys.txt"
  [[ "$status" -eq 0 ]]
  # Only the dev-qwen token should be flagged as updated.
  echo "$output" | grep -q "dev-qwen.*updated"
  # And the new value must actually be stored (path disinto/bots/dev-qwen,
  # key "token").
  run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
    "${VAULT_ADDR}/v1/kv/data/disinto/bots/dev-qwen"
  [[ "$status" -eq 0 ]]
  echo "$output" | jq -e '.data.data.token == "MODIFIED-LLAMA-TOKEN"'
}
# --- Delimiter-in-value regression (#898) ────────────────────────────────────
@test "preserves secret values that contain a pipe character" {
  # Regression (#898): the old accumulator packed "value|status" and joined
  # per-path kv pairs with '|', so any value containing a pipe was silently
  # truncated or misrouted. Exercise both a paired bot path (token + pass on
  # one vault path, hitting the per-path join) and a single-key path (admin
  # token). Values are single-quoted so they survive `source` of the .env
  # file ('|' is a shell metachar); quoting is the user's responsibility —
  # preserving the quoted value intact is the accumulator's, tested here.
  local env_with_pipes="${BATS_TEST_TMPDIR}/dot-env-piped"
  cp "$FIXTURES_DIR/dot-env-complete" "$env_with_pipes"
  sed -i "s#^FORGE_REVIEW_TOKEN=.*#FORGE_REVIEW_TOKEN='abc|xyz'#" "$env_with_pipes"
  sed -i "s#^FORGE_REVIEW_PASS=.*#FORGE_REVIEW_PASS='p1|p2|p3'#" "$env_with_pipes"
  sed -i "s#^FORGE_ADMIN_TOKEN=.*#FORGE_ADMIN_TOKEN='admin|with|pipes'#" "$env_with_pipes"
  run "$IMPORT_SCRIPT" \
    --env "$env_with_pipes" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "$FIXTURES_DIR/age-keys.txt"
  [[ "$status" -eq 0 ]]
  # Every piped value must round-trip byte-for-byte.
  run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
    "${VAULT_ADDR}/v1/kv/data/disinto/bots/review"
  [[ "$status" -eq 0 ]]
  echo "$output" | jq -e '.data.data.token == "abc|xyz"'
  echo "$output" | jq -e '.data.data.pass == "p1|p2|p3"'
  run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
    "${VAULT_ADDR}/v1/kv/data/disinto/shared/forge"
  [[ "$status" -eq 0 ]]
  echo "$output" | jq -e '.data.data.admin_token == "admin|with|pipes"'
}
# --- Incomplete fixture ───────────────────────────────────────────────────────
@test "handles incomplete fixture gracefully" {
  # The incomplete fixture is missing some keys. That must not be fatal:
  # the importer imports what exists and warns about missing pairs on
  # stderr instead of aborting.
  run "$IMPORT_SCRIPT" \
    --env "$FIXTURES_DIR/dot-env-incomplete" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "$FIXTURES_DIR/age-keys.txt"
  # A single status check suffices — the original asserted the same
  # unchanged $status twice with no intervening `run`, so the second
  # check was dead code.
  [ "$status" -eq 0 ]
  # The keys that ARE present (e.g. the review bot) must still be imported.
  echo "$output" | grep -q "review"
}
# --- Security: no secrets in output ───────────────────────────────────────────
@test "never logs secret values in stdout" {
  run "$IMPORT_SCRIPT" \
    --env "$FIXTURES_DIR/dot-env-complete" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "$FIXTURES_DIR/age-keys.txt"
  [ "$status" -eq 0 ]
  # The importer may mention key names and statuses, never the values.
  # These are the literal secret values planted in the fixtures; none may
  # appear anywhere in the captured output.
  local leaked
  for leaked in \
    "generic-forge-token" "generic-forge-pass" "generic-admin-token" \
    "review-token" "review-pass" "llama-token" "llama-pass" \
    "wp-agent-secret" "forward-auth-secret" "github-test-token" \
    "codeberg-test-token" "clawhub-test-token" "deploy-key-test" \
    "npm-test-token" "dockerhub-test-token"
  do
    if grep -q "$leaked" <<<"$output"; then
      echo "FAIL: Found secret pattern '$leaked' in output" >&2
      echo "Output was:" >&2
      echo "$output" >&2
      return 1
    fi
  done
}
# --- Error handling ───────────────────────────────────────────────────────────
@test "fails with missing --env argument" {
  run "$IMPORT_SCRIPT" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "$FIXTURES_DIR/age-keys.txt"
  [[ "$status" -ne 0 ]]
  [[ "$output" == *"Missing required argument"* ]]
}
@test "fails with missing --sops argument" {
  run "$IMPORT_SCRIPT" \
    --env "$FIXTURES_DIR/dot-env-complete" \
    --age-key "$FIXTURES_DIR/age-keys.txt"
  [[ "$status" -ne 0 ]]
  [[ "$output" == *"Missing required argument"* ]]
}
@test "fails with missing --age-key argument" {
  run "$IMPORT_SCRIPT" \
    --env "$FIXTURES_DIR/dot-env-complete" \
    --sops "$FIXTURES_DIR/.env.vault.enc"
  [[ "$status" -ne 0 ]]
  [[ "$output" == *"Missing required argument"* ]]
}
@test "fails with non-existent env file" {
  run "$IMPORT_SCRIPT" \
    --env "/nonexistent/.env" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "$FIXTURES_DIR/age-keys.txt"
  [[ "$status" -ne 0 ]]
  [[ "$output" == *"not found"* ]]
}
@test "fails with non-existent sops file" {
  run "$IMPORT_SCRIPT" \
    --env "$FIXTURES_DIR/dot-env-complete" \
    --sops "/nonexistent/.env.vault.enc" \
    --age-key "$FIXTURES_DIR/age-keys.txt"
  [[ "$status" -ne 0 ]]
  [[ "$output" == *"not found"* ]]
}
@test "fails with non-existent age key file" {
  run "$IMPORT_SCRIPT" \
    --env "$FIXTURES_DIR/dot-env-complete" \
    --sops "$FIXTURES_DIR/.env.vault.enc" \
    --age-key "/nonexistent/age-keys.txt"
  [[ "$status" -ne 0 ]]
  [[ "$output" == *"not found"* ]]
}

View file

@ -1,145 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# tools/vault-apply-policies.sh — Idempotent Vault policy sync
#
# Part of the Nomad+Vault migration (S2.1, issue #879). Reads every
# vault/policies/*.hcl file and upserts it into Vault as an ACL policy
# named after the file's basename (without the .hcl suffix).
#
# Idempotency contract:
# For each vault/policies/<NAME>.hcl:
# - Policy missing in Vault → apply, log "policy <NAME> created"
# - Policy present, content same → skip, log "policy <NAME> unchanged"
# - Policy present, content diff → apply, log "policy <NAME> updated"
#
# Comparison is byte-for-byte against the on-server policy text returned by
# GET sys/policies/acl/<NAME>.data.policy. Re-running with no file edits is
# a guaranteed no-op that reports every policy as "unchanged".
#
# --dry-run: prints <NAME> <SHA256> for each file that WOULD be applied;
# does not call Vault at all (no GETs, no PUTs). Exits 0.
#
# Requires:
# - VAULT_ADDR (e.g. http://127.0.0.1:8200)
# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh)
# - curl, jq, sha256sum
#
# Usage:
# tools/vault-apply-policies.sh
# tools/vault-apply-policies.sh --dry-run
#
# Exit codes:
# 0 success (policies synced, or --dry-run completed)
# 1 precondition / API failure
# =============================================================================
set -euo pipefail
# Resolve the repo root relative to this script so it works from any CWD.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
POLICIES_DIR="${REPO_ROOT}/vault/policies"
# shellcheck source=../lib/hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
# Uniform log prefix; die() reports to stderr and aborts with exit 1.
log() { printf '[vault-apply] %s\n' "$*"; }
die() { printf '[vault-apply] ERROR: %s\n' "$*" >&2; exit 1; }
# ── Flag parsing ─────────────────────────────────────────────────────────────
# Single optional flag — no loop needed. Keeps this block textually distinct
# from the multi-flag `while/case` parsers elsewhere in the repo (see
# .woodpecker/detect-duplicates.py — sliding 5-line window).
dry_run=false
# At most one positional argument (the optional flag) is accepted.
[ "$#" -le 1 ] || die "too many arguments (saw: $*)"
case "${1:-}" in
'') ;; # no flag → live sync
--dry-run) dry_run=true ;; # plan only — no Vault calls at all
-h|--help) printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
printf 'Apply every vault/policies/*.hcl to Vault as an ACL policy.\n'
printf 'Idempotent: unchanged policies are reported as "unchanged" and\n'
printf 'not written.\n\n'
printf ' --dry-run Print policy names + content SHA256 that would be\n'
printf ' applied, without contacting Vault. Exits 0.\n'
exit 0 ;;
*) die "unknown flag: $1" ;;
esac
# ── Preconditions ────────────────────────────────────────────────────────────
# Fail fast when a required tool or the policies directory is missing.
for tool in curl jq sha256sum; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    die "required binary not found: ${tool}"
  fi
done
if [ ! -d "$POLICIES_DIR" ]; then
  die "policies directory not found: ${POLICIES_DIR}"
fi
# Stable lexicographic ordering keeps log output deterministic across runs
# and CI diffs.
mapfile -t POLICY_FILES < <(
  find "$POLICIES_DIR" -maxdepth 1 -type f -name '*.hcl' | LC_ALL=C sort
)
[ "${#POLICY_FILES[@]}" -gt 0 ] || die "no *.hcl files in ${POLICIES_DIR}"
# ── Dry-run: print plan + exit (no Vault calls) ──────────────────────────────
# Print each policy's name and content hash without any GETs or PUTs.
if [ "$dry_run" = true ]; then
  log "dry-run — ${#POLICY_FILES[@]} policy file(s) in ${POLICIES_DIR}"
  for policy_file in "${POLICY_FILES[@]}"; do
    printf '[vault-apply] would apply policy %s (sha256=%s)\n' \
      "$(basename "$policy_file" .hcl)" \
      "$(sha256sum "$policy_file" | awk '{print $1}')"
  done
  exit 0
fi
# ── Live run: Vault connectivity check ───────────────────────────────────────
# Default the local-cluster Vault env (see lib/hvault.sh::_hvault_default_env).
# `disinto init` does not export VAULT_ADDR before calling this script — the
# server is reachable on 127.0.0.1:8200 and the root token lives at
# /etc/vault.d/root.token in the common fresh-LXC case (issue #912).
# NOTE(review): presumably a no-op when VAULT_ADDR/VAULT_TOKEN are already
# exported — confirm in lib/hvault.sh.
_hvault_default_env
# hvault_token_lookup both resolves the token (env or /etc/vault.d/root.token)
# and confirms the server is reachable with a valid token. Fail fast here so
# the per-file loop below doesn't emit N identical "HTTP 403" errors.
hvault_token_lookup >/dev/null \
|| die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
# ── Apply each policy, reporting created/updated/unchanged ───────────────────
log "syncing ${#POLICY_FILES[@]} polic(y|ies) from ${POLICIES_DIR}"
for policy_file in "${POLICY_FILES[@]}"; do
  name="$(basename "$policy_file" .hcl)"
  desired="$(cat "$policy_file")"
  # hvault_get_or_empty: raw JSON body on 200, empty string on 404. An
  # empty response (or an empty .data.policy) means "create".
  raw="$(hvault_get_or_empty "sys/policies/acl/${name}")" \
    || die "failed to read existing policy: ${name}"
  current=""
  if [ -n "$raw" ]; then
    current="$(printf '%s' "$raw" | jq -r '.data.policy // ""')" \
      || die "failed to parse policy response: ${name}"
  fi
  if [ -z "$current" ]; then
    hvault_policy_apply "$name" "$policy_file" \
      || die "failed to create policy: ${name}"
    log "policy ${name} created"
  elif [ "$current" = "$desired" ]; then
    # Byte-for-byte match with the on-server text → guaranteed no-op.
    log "policy ${name} unchanged"
  else
    hvault_policy_apply "$name" "$policy_file" \
      || die "failed to update policy: ${name}"
    log "policy ${name} updated"
  fi
done
log "done — ${#POLICY_FILES[@]} polic(y|ies) synced"

View file

@ -1,308 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# tools/vault-apply-roles.sh — Idempotent Vault JWT-auth role sync
#
# Part of the Nomad+Vault migration (S2.3, issue #881). Reads
# vault/roles.yaml and upserts each entry as a Vault role under
# auth/jwt-nomad/role/<name>.
#
# Idempotency contract:
# For each role entry in vault/roles.yaml:
# - Role missing in Vault → write, log "role <NAME> created"
# - Role present, fields match → skip, log "role <NAME> unchanged"
# - Role present, fields differ → write, log "role <NAME> updated"
#
# Comparison is per-field on the data the CLI would read back
# (GET auth/jwt-nomad/role/<NAME>.data.{policies,bound_audiences,
# bound_claims,token_ttl,token_max_ttl,token_type}). Only the fields
# this script owns are compared — a future field added by hand in
# Vault would not be reverted on the next run.
#
# --dry-run: prints the planned role list + full payload for each role
# WITHOUT touching Vault. Exits 0.
#
# Preconditions:
# - Vault auth method jwt-nomad must already be enabled + configured
# (done by lib/init/nomad/vault-nomad-auth.sh — which then calls
# this script). Running this script standalone against a Vault with
# no jwt-nomad path will fail on the first role write.
# - vault/roles.yaml present. See that file's header for the format.
#
# Requires:
# - VAULT_ADDR (e.g. http://127.0.0.1:8200)
# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh)
# - curl, jq, awk
#
# Usage:
# tools/vault-apply-roles.sh
# tools/vault-apply-roles.sh --dry-run
#
# Exit codes:
# 0 success (roles synced, or --dry-run completed)
# 1 precondition / API / parse failure
# =============================================================================
set -euo pipefail
# Resolve repo-relative paths so the script works from any CWD.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
ROLES_FILE="${REPO_ROOT}/vault/roles.yaml"
# shellcheck source=../lib/hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
# Constants shared across every role — the issue's AC names these as the
# invariant token shape for Nomad workload identity. Bumping any of these
# is a knowing, repo-wide change, not a per-role knob, so they live here
# rather than as per-entry fields in roles.yaml.
ROLE_AUDIENCE="vault.io"
ROLE_TOKEN_TYPE="service"
ROLE_TOKEN_TTL="1h"
ROLE_TOKEN_MAX_TTL="24h"
# Uniform log prefix; die() reports to stderr and aborts with exit 1.
log() { printf '[vault-roles] %s\n' "$*"; }
die() { printf '[vault-roles] ERROR: %s\n' "$*" >&2; exit 1; }
# ── Flag parsing (single optional flag — see vault-apply-policies.sh for the
# sibling grammar). Structured as arg-count guard + dispatch to keep the
# 5-line sliding-window duplicate detector (.woodpecker/detect-duplicates.py)
# from flagging this as shared boilerplate with vault-apply-policies.sh —
# the two parsers implement the same shape but with different control flow.
dry_run=false
# Anything beyond the single optional flag is an error.
if [ "$#" -gt 1 ]; then
die "too many arguments (saw: $*)"
fi
arg="${1:-}"
if [ "$arg" = "--dry-run" ]; then
dry_run=true
elif [ "$arg" = "-h" ] || [ "$arg" = "--help" ]; then
printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
printf 'Apply every role in vault/roles.yaml to Vault as a\n'
printf 'jwt-nomad role. Idempotent: unchanged roles are reported\n'
printf 'as "unchanged" and not written.\n\n'
printf ' --dry-run Print the planned role list + full role\n'
printf ' payload without contacting Vault. Exits 0.\n'
exit 0
elif [ -n "$arg" ]; then
die "unknown flag: $arg"
fi
# Drop the scratch variable so it cannot leak into later code.
unset arg
# ── Preconditions ────────────────────────────────────────────────────────────
# Fail fast when a required tool or the roles file is missing.
required_tools=(curl jq awk)
for tool in "${required_tools[@]}"; do
  command -v "$tool" >/dev/null 2>&1 || die "required binary not found: ${tool}"
done
[ -f "$ROLES_FILE" ] || die "roles file not found: ${ROLES_FILE}"
# ── Parse vault/roles.yaml → TSV ─────────────────────────────────────────────
# Strict-format parser. One awk pass; emits one TAB-separated line per role:
# <name>\t<policy>\t<namespace>\t<job_id>
#
# Grammar: a record opens on a line matching `- name: <value>` and closes
# on the next `- name:` or EOF. Within a record, `policy:`, `namespace:`,
# and `job_id:` lines populate the record. Comments (`#...`) and blank
# lines are ignored. Whitespace around the colon and value is trimmed.
#
# This is intentionally narrower than full YAML — the file's header
# documents the exact subset. If someone adds nested maps, arrays, or
# anchors, this parser will silently drop them; the completeness check
# below catches records missing any of the four fields.
parse_roles() {
# Emits one TSV line per role: "<name>\t<policy>\t<namespace>\t<job_id>".
# Records missing any of the four fields are prefixed "INCOMPLETE\t" so the
# caller can report every malformed entry at once.
awk '
function trim(s) { sub(/^[[:space:]]+/, "", s); sub(/[[:space:]]+$/, "", s); return s }
function strip_comment(s) { sub(/[[:space:]]+#.*$/, "", s); return s }
function emit() {
if (name != "") {
if (policy == "" || namespace == "" || job_id == "") {
printf "INCOMPLETE\t%s\t%s\t%s\t%s\n", name, policy, namespace, job_id
} else {
printf "%s\t%s\t%s\t%s\n", name, policy, namespace, job_id
}
}
name=""; policy=""; namespace=""; job_id=""
}
BEGIN { name=""; policy=""; namespace=""; job_id="" }
# Strip full-line comments and blank lines early.
/^[[:space:]]*#/ { next }
/^[[:space:]]*$/ { next }
# New record: "- name: <value>"
/^[[:space:]]*-[[:space:]]+name:[[:space:]]/ {
emit()
line=strip_comment($0)
sub(/^[[:space:]]*-[[:space:]]+name:[[:space:]]*/, "", line)
name=trim(line)
next
}
# Field within current record. Only accept when a record is open.
/^[[:space:]]+policy:[[:space:]]/ && name != "" {
line=strip_comment($0); sub(/^[[:space:]]+policy:[[:space:]]*/, "", line)
policy=trim(line); next
}
/^[[:space:]]+namespace:[[:space:]]/ && name != "" {
line=strip_comment($0); sub(/^[[:space:]]+namespace:[[:space:]]*/, "", line)
namespace=trim(line); next
}
/^[[:space:]]+job_id:[[:space:]]/ && name != "" {
line=strip_comment($0); sub(/^[[:space:]]+job_id:[[:space:]]*/, "", line)
job_id=trim(line); next
}
END { emit() }
' "$ROLES_FILE"
}
mapfile -t ROLE_RECORDS < <(parse_roles)
[ "${#ROLE_RECORDS[@]}" -gt 0 ] || die "no roles parsed from ${ROLES_FILE}"
# Surface every malformed record in one pass. An INCOMPLETE line has the
# form "INCOMPLETE\t<name>\t<policy>\t<namespace>\t<job_id>" — collecting
# them all means the operator sees every missing field, not one per run.
incomplete=()
for rec in "${ROLE_RECORDS[@]}"; do
  if [[ "$rec" == INCOMPLETE* ]]; then
    incomplete+=("${rec#INCOMPLETE$'\t'}")
  fi
done
if [ "${#incomplete[@]}" -gt 0 ]; then
  printf '[vault-roles] ERROR: role entries with missing fields:\n' >&2
  for row in "${incomplete[@]}"; do
    IFS=$'\t' read -r name policy namespace job_id <<<"$row"
    printf ' - name=%-24s policy=%-22s namespace=%-10s job_id=%s\n' \
      "${name:-<missing>}" "${policy:-<missing>}" \
      "${namespace:-<missing>}" "${job_id:-<missing>}" >&2
  done
  die "fix ${ROLES_FILE} and re-run"
fi
# ── Helper: build the JSON payload Vault expects for a role ──────────────────
# Keeps bound_audiences as a JSON array (required by the API — a scalar
# string silently becomes a one-element-list in the CLI but the HTTP API
# rejects it). All fields that differ between runs are inside this payload
# so the diff-check below (role_fields_match) compares like-for-like.
build_payload() {
# $1 = token policy name, $2 = Nomad namespace, $3 = Nomad job ID.
# Emits the JSON role definition on stdout. bound_audiences stays a JSON
# array — the HTTP API rejects a bare string even though the CLI coerces
# one into a one-element list.
local policy="$1" namespace="$2" job_id="$3"
jq -n \
--arg aud "$ROLE_AUDIENCE" \
--arg policy "$policy" \
--arg ns "$namespace" \
--arg job "$job_id" \
--arg ttype "$ROLE_TOKEN_TYPE" \
--arg ttl "$ROLE_TOKEN_TTL" \
--arg maxttl "$ROLE_TOKEN_MAX_TTL" \
'{
role_type: "jwt",
bound_audiences: [$aud],
user_claim: "nomad_job_id",
bound_claims: { nomad_namespace: $ns, nomad_job_id: $job },
token_type: $ttype,
token_policies: [$policy],
token_ttl: $ttl,
token_max_ttl: $maxttl
}'
}
# ── Dry-run: print plan + exit (no Vault calls) ──────────────────────────────
# One header line per role plus its full, sorted, indented payload.
if [ "$dry_run" = true ]; then
log "dry-run — ${#ROLE_RECORDS[@]} role(s) in ${ROLES_FILE}"
for rec in "${ROLE_RECORDS[@]}"; do
IFS=$'\t' read -r name policy namespace job_id <<<"$rec"
payload="$(build_payload "$policy" "$namespace" "$job_id")"
printf '[vault-roles] would apply role %s → policy=%s namespace=%s job_id=%s\n' \
"$name" "$policy" "$namespace" "$job_id"
# jq -S sorts keys for stable output; sed indents it under the header.
printf '%s\n' "$payload" | jq -S . | sed 's/^/ /'
done
exit 0
fi
# ── Live run: Vault connectivity check ───────────────────────────────────────
# Default the local-cluster Vault env (see lib/hvault.sh::_hvault_default_env).
# Called transitively from vault-nomad-auth.sh during `disinto init`, which
# does not export VAULT_ADDR in the common fresh-LXC case (issue #912).
_hvault_default_env
# Probe auth once up front so the per-role loop below fails fast instead of
# emitting one identical error per role.
if ! hvault_token_lookup >/dev/null; then
die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
fi
# ── Helper: compare on-server role to desired payload ────────────────────────
# Returns 0 iff every field this script owns matches. Fields not in our
# payload (e.g. a manually-added `ttl` via the UI) are ignored — we don't
# revert them, but we also don't block on them.
role_fields_match() {
# $1 = raw JSON body returned by GET auth/jwt-nomad/role/<NAME>
# $2 = desired payload as built by build_payload
# Returns 0 iff every script-owned field matches; extra server-side fields
# are ignored (not reverted, not blocking).
local current_json="$1" desired_json="$2"
local keys=(
role_type bound_audiences user_claim bound_claims
token_type token_policies token_ttl token_max_ttl
)
# Vault returns token_ttl/token_max_ttl as integers (seconds) on GET but
# accepts strings ("1h") on PUT. Normalize the desired side through
# _duration_to_seconds so both sides compare as plain integers.
local cur des
for k in "${keys[@]}"; do
# jq -cS canonicalizes both sides (compact, sorted keys) so structural
# values like bound_claims compare as strings reliably.
cur="$(printf '%s' "$current_json" | jq -cS --arg k "$k" '.data[$k] // null')"
des="$(printf '%s' "$desired_json" | jq -cS --arg k "$k" '.[$k] // null')"
case "$k" in
token_ttl|token_max_ttl)
# Normalize desired: "1h"→3600, "24h"→86400.
des="$(printf '%s' "$des" | jq -r '. // ""' | _duration_to_seconds)"
cur="$(printf '%s' "$cur" | jq -r '. // 0')"
;;
esac
if [ "$cur" != "$des" ]; then
return 1
fi
done
return 0
}
# _duration_to_seconds — read a duration string on stdin, emit seconds on
# stdout. Accepted shapes (the only ones this repo emits): "Ns", "Nm", "Nh",
# "Nd", or a bare integer (passed through). Empty/"null" input yields 0.
# Any other shape yields the empty string, which can never equal Vault's
# integer response and therefore forces an update.
_duration_to_seconds() {
  local raw
  raw="$(cat)"
  if [[ -z "$raw" || "$raw" == null ]]; then
    printf '0'
  elif [[ "$raw" == *[0-9]s ]]; then
    printf '%d' "${raw%s}"
  elif [[ "$raw" == *[0-9]m ]]; then
    printf '%d' "$(( ${raw%m} * 60 ))"
  elif [[ "$raw" == *[0-9]h ]]; then
    printf '%d' "$(( ${raw%h} * 3600 ))"
  elif [[ "$raw" == *[0-9]d ]]; then
    printf '%d' "$(( ${raw%d} * 86400 ))"
  elif [[ "$raw" == *[0-9] ]]; then
    printf '%d' "$raw"
  else
    printf ''
  fi
}
# ── Apply each role, reporting created/updated/unchanged ─────────────────────
log "syncing ${#ROLE_RECORDS[@]} role(s) from ${ROLES_FILE}"
for record in "${ROLE_RECORDS[@]}"; do
  IFS=$'\t' read -r name policy namespace job_id <<<"$record"
  desired_payload="$(build_payload "$policy" "$namespace" "$job_id")"
  # hvault_get_or_empty: raw body on 200, empty string on 404 ("create").
  current_json="$(hvault_get_or_empty "auth/jwt-nomad/role/${name}")" \
    || die "failed to read existing role: ${name}"
  if [ -z "$current_json" ]; then
    _hvault_request POST "auth/jwt-nomad/role/${name}" "$desired_payload" >/dev/null \
      || die "failed to create role: ${name}"
    log "role ${name} created"
  elif role_fields_match "$current_json" "$desired_payload"; then
    log "role ${name} unchanged"
  else
    _hvault_request POST "auth/jwt-nomad/role/${name}" "$desired_payload" >/dev/null \
      || die "failed to update role: ${name}"
    log "role ${name} updated"
  fi
done
log "done — ${#ROLE_RECORDS[@]} role(s) synced"

View file

@ -1,593 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# vault-import.sh — Import .env and sops-decrypted secrets into Vault KV
#
# Reads existing .env and sops-encrypted .env.vault.enc from the old docker stack
# and writes them to Vault KV paths matching the S2.1 policy layout.
#
# Usage:
# vault-import.sh \
# --env /path/to/.env \
# [--sops /path/to/.env.vault.enc] \
# [--age-key /path/to/age/keys.txt]
#
# Flag validation (S2.5, issue #883):
# --import-sops without --age-key → error.
# --age-key without --import-sops → error.
# --env alone (no sops) → OK; imports only the plaintext half.
#
# Mapping:
# From .env:
# - FORGE_{ROLE}_TOKEN + FORGE_{ROLE}_PASS → kv/disinto/bots/<role>/{token,password}
# (roles: review, dev, gardener, architect, planner, predictor, supervisor, vault)
# - FORGE_TOKEN_LLAMA + FORGE_PASS_LLAMA → kv/disinto/bots/dev-qwen/{token,password}
# - FORGE_TOKEN + FORGE_PASS → kv/disinto/shared/forge/{token,password}
# - FORGE_ADMIN_TOKEN → kv/disinto/shared/forge/admin_token
# - WOODPECKER_* → kv/disinto/shared/woodpecker/<lowercase_key>
# - FORWARD_AUTH_SECRET, CHAT_OAUTH_* → kv/disinto/shared/chat/<lowercase_key>
# From sops-decrypted .env.vault.enc:
# - GITHUB_TOKEN, CODEBERG_TOKEN, CLAWHUB_TOKEN, DEPLOY_KEY, NPM_TOKEN, DOCKER_HUB_TOKEN
# → kv/disinto/runner/<NAME>/value
#
# Security:
# - Refuses to run if VAULT_ADDR is not localhost
# - Writes to KV v2, not v1
# - Validates sops age key file is mode 0400 before sourcing
# - Never logs secret values — only key names
#
# Idempotency:
# - Reports unchanged/updated/created per key via hvault_kv_get
# - --dry-run prints the full import plan without writing
# =============================================================================
set -euo pipefail
# ── Internal helpers ──────────────────────────────────────────────────────────
# _log — print a tagged status line on stdout (stderr is reserved for
# errors so normal output stays clean).
_log() {
  printf '%s\n' "[vault-import] $*"
}
# _err — print a tagged error line on stderr.
_err() {
  printf '%s\n' "[vault-import] ERROR: $*" >&2
}
# _die — report a fatal error through _err, then abort with status 1.
_die() { _err "$@"; exit 1; }
# _check_vault_addr — refuse to proceed unless VAULT_ADDR points at this
# host: http(s)://localhost or 127.0.0.1, with an optional numeric port.
# The import writes real secrets, so a remote Vault is rejected outright.
_check_vault_addr() {
  local addr="${VAULT_ADDR:-}"
  local local_re='^https?://(localhost|127\.0\.0\.1)(:[0-9]+)?$'
  if [[ "$addr" =~ $local_re ]]; then
    return 0
  fi
  _die "Security check failed: VAULT_ADDR must be localhost for safety. Got: $addr"
}
# _validate_age_key_perms — insist the age key file is exactly mode 0400
# before it is ever read; anything looser could expose the decryption key
# to other local users.
_validate_age_key_perms() {
  local keyfile="$1" mode
  if ! mode="$(stat -c '%a' "$keyfile" 2>/dev/null)"; then
    _die "Cannot stat age key file: $keyfile"
  fi
  [ "$mode" = "400" ] \
    || _die "Age key file permissions are $mode, expected 400. Refusing to proceed for security."
}
# _decrypt_sops — decrypt a sops file and print its KEY=VALUE lines.
# $1: sops-encrypted file; $2: age key file (exported as SOPS_AGE_KEY_FILE
# for the sops child process only).
# With `pipefail` in effect (set at the top of this script) the pipeline —
# and therefore this function — fails when sops itself fails OR when the
# decrypted output contains no KEY=VALUE lines at all.
#
# Fix: dropped the trailing `sed 's/^\([^=]*\)=\(.*\)$/\1=\2/'` — it
# rewrote every line to itself (a literal no-op) and only obscured the
# pipeline.
_decrypt_sops() {
  local sops_file="$1"
  local age_key="$2"
  local output
  output="$(SOPS_AGE_KEY_FILE="$age_key" sops -d "$sops_file" 2>/dev/null | \
    grep -E '^[A-Z_][A-Z0-9_]*=')" || \
    _die "Failed to decrypt sops file: $sops_file. Check age key and file integrity."
  printf '%s' "$output"
}
# _load_env_file — source only the well-formed KEY=value lines of an env
# file, skipping comments, blank lines, and anything else `source` might
# choke on. Assignments land in the calling shell's variables.
_load_env_file() {
  local src="$1" filtered
  filtered="$(mktemp)"
  # Keep only lines shaped like shell assignments; ignore grep's status
  # so an env file with no assignments is not fatal.
  grep -E '^[A-Za-z_][A-Za-z0-9_]*=' "$src" > "$filtered" 2>/dev/null || true
  # shellcheck source=/dev/null
  source "$filtered"
  rm -f "$filtered"
}
# _kv_path_exists — probe whether a KV path is readable (0 = exists,
# 1 = missing or unreadable).
#
# Fix: the old version re-ran hvault_kv_get a second time on failure just
# to grep its stderr for "not found|404" — but then returned 1 on BOTH
# grep outcomes, a dead distinction that cost an extra Vault round-trip.
# One probe is sufficient: any failure (404, auth error, …) reports
# "does not exist", exactly as before.
_kv_path_exists() {
  local path="$1"
  if hvault_kv_get "$path" >/dev/null 2>&1; then
    return 0
  fi
  return 1
}
# _kv_get_value — fetch one key's value from a KV path. Thin pass-through
# to hvault_kv_get ($1: path, $2: key); the value arrives on stdout.
_kv_get_value() {
  hvault_kv_get "$1" "$2"
}
# _kv_put_secret — write one KV v2 document.
# $1: logical path (under the mount's data/ prefix); remaining args are
# "key=value" pairs, split on the FIRST '=' so values may contain '='.
# Returns 0 on any 2xx response; logs a specific diagnostic and returns 1
# on 404 / 403 / anything else. Never logs secret values.
_kv_put_secret() {
  local kv_path="$1"
  shift
  # Assemble the JSON payload one pair at a time; jq --arg keeps quoting,
  # backslashes and unicode safe.
  local body='{"data":{}}'
  local pair k v
  for pair in "$@"; do
    k="${pair%%=*}"
    v="${pair#*=}"
    body="$(printf '%s' "$body" | jq --arg k "$k" --arg v "$v" '. * {"data": {($k): $v}}')"
  done
  # POST to the KV v2 data endpoint; capture only the HTTP status, the
  # response body is discarded via a temp file.
  local resp status
  resp="$(mktemp)"
  if ! status="$(curl -s -w '%{http_code}' \
      -H "X-Vault-Token: ${VAULT_TOKEN}" \
      -H "Content-Type: application/json" \
      -X POST \
      -d "$body" \
      -o "$resp" \
      "${VAULT_ADDR}/v1/${VAULT_KV_MOUNT:-kv}/data/${kv_path}")"; then
    rm -f "$resp"
    _err "Failed to write to Vault at ${VAULT_KV_MOUNT:-kv}/data/${kv_path}: curl error"
    return 1
  fi
  rm -f "$resp"
  # 2xx → success; known failures get targeted messages.
  if [[ "$status" =~ ^2[0-9][0-9]$ ]]; then
    return 0
  elif [ "$status" = "404" ]; then
    _err "KV path not found: ${VAULT_KV_MOUNT:-kv}/data/${kv_path}"
    return 1
  elif [ "$status" = "403" ]; then
    _err "Permission denied writing to ${VAULT_KV_MOUNT:-kv}/data/${kv_path}"
    return 1
  else
    _err "Failed to write to Vault at ${VAULT_KV_MOUNT:-kv}/data/${kv_path}: HTTP $status"
    return 1
  fi
}
# _format_status — render one " status: path/key (status)" report line
# (no trailing newline). Recognized statuses repeat themselves in the
# parenthesized tag; anything else keeps its raw name up front but is
# tagged "(unknown)".
_format_status() {
  local status="$1" path="$2" key="$3"
  local tag
  case "$status" in
    unchanged|updated|created) tag="$status" ;;
    *)                         tag="unknown" ;;
  esac
  printf ' %s: %s/%s (%s)' "$status" "$path" "$key" "$tag"
}
# ── Mapping definitions ──────────────────────────────────────────────────────
# Bots mapping: FORGE_{ROLE}_TOKEN + FORGE_{ROLE}_PASS
# Each role maps to kv/disinto/bots/<role> (see the header mapping table).
declare -a BOT_ROLES=(review dev gardener architect planner predictor supervisor vault)
# Runner tokens from sops-decrypted file
# Each maps to kv/disinto/runner/<NAME>/value; only non-empty ones import.
declare -a RUNNER_TOKENS=(GITHUB_TOKEN CODEBERG_TOKEN CLAWHUB_TOKEN DEPLOY_KEY NPM_TOKEN DOCKER_HUB_TOKEN)
# ── Main logic ────────────────────────────────────────────────────────────────
# main — parse flags, gather secrets from the plaintext .env (and the
# optional sops-encrypted half), build an import plan, then either print
# the plan (--dry-run) or write grouped KV v2 documents with per-key
# status output. Exits non-zero via _die on any validation failure.
main() {
local env_file=""
local sops_file=""
local age_key_file=""
local dry_run=false
# Parse arguments
while [[ $# -gt 0 ]]; do
case "$1" in
--env)
env_file="$2"
shift 2
;;
--sops)
sops_file="$2"
shift 2
;;
--age-key)
age_key_file="$2"
shift 2
;;
--dry-run)
dry_run=true
shift
;;
--help|-h)
cat <<'EOF'
vault-import.sh — Import .env and sops-decrypted secrets into Vault KV
Usage:
vault-import.sh \
--env /path/to/.env \
[--sops /path/to/.env.vault.enc] \
[--age-key /path/to/age/keys.txt] \
[--dry-run]
Options:
--env Path to .env file (required)
--sops Path to sops-encrypted .env.vault.enc file (optional;
requires --age-key when set)
--age-key Path to age keys file (required when --sops is set)
--dry-run Print import plan without writing to Vault (optional)
--help Show this help message
Mapping:
From .env:
- FORGE_{ROLE}_TOKEN + FORGE_{ROLE}_PASS → kv/disinto/bots/<role>/{token,password}
- FORGE_TOKEN_LLAMA + FORGE_PASS_LLAMA → kv/disinto/bots/dev-qwen/{token,password}
- FORGE_TOKEN + FORGE_PASS → kv/disinto/shared/forge/{token,password}
- FORGE_ADMIN_TOKEN → kv/disinto/shared/forge/admin_token
- WOODPECKER_* → kv/disinto/shared/woodpecker/<lowercase_key>
- FORWARD_AUTH_SECRET, CHAT_OAUTH_* → kv/disinto/shared/chat/<lowercase_key>
From sops-decrypted .env.vault.enc:
- GITHUB_TOKEN, CODEBERG_TOKEN, CLAWHUB_TOKEN, DEPLOY_KEY, NPM_TOKEN, DOCKER_HUB_TOKEN
→ kv/disinto/runner/<NAME>/value
Examples:
vault-import.sh --env .env --sops .env.vault.enc --age-key age-keys.txt
vault-import.sh --env .env --sops .env.vault.enc --age-key age-keys.txt --dry-run
EOF
exit 0
;;
*)
_die "Unknown option: $1. Use --help for usage."
;;
esac
done
# Validate required arguments. --sops and --age-key are paired: if one
# is set, the other must be too. --env alone (no sops half) is valid —
# imports only the plaintext dotenv. Spec: S2.5 / issue #883 / #912.
if [ -z "$env_file" ]; then
_die "Missing required argument: --env"
fi
if [ -n "$sops_file" ] && [ -z "$age_key_file" ]; then
_die "--sops requires --age-key"
fi
if [ -n "$age_key_file" ] && [ -z "$sops_file" ]; then
_die "--age-key requires --sops"
fi
# Validate files exist
if [ ! -f "$env_file" ]; then
_die "Environment file not found: $env_file"
fi
if [ -n "$sops_file" ] && [ ! -f "$sops_file" ]; then
_die "Sops file not found: $sops_file"
fi
if [ -n "$age_key_file" ] && [ ! -f "$age_key_file" ]; then
_die "Age key file not found: $age_key_file"
fi
# Security check: age key permissions (only when an age key is provided —
# --env-only imports never touch the age key).
if [ -n "$age_key_file" ]; then
_validate_age_key_perms "$age_key_file"
fi
# Source the Vault helpers and default the local-cluster VAULT_ADDR +
# VAULT_TOKEN before the localhost safety check runs. `disinto init`
# does not export these in the common fresh-LXC case (issue #912).
source "$(dirname "$0")/../lib/hvault.sh"
_hvault_default_env
# Security check: VAULT_ADDR must be localhost
_check_vault_addr
# Load .env file
_log "Loading environment from: $env_file"
_load_env_file "$env_file"
# Decrypt sops file when --sops was provided. On the --env-only path
# (empty $sops_file) the sops_env stays empty and the per-token loop
# below silently skips runner-token imports — exactly the "only
# plaintext half" spec from S2.5.
local sops_env=""
if [ -n "$sops_file" ]; then
_log "Decrypting sops file: $sops_file"
sops_env="$(_decrypt_sops "$sops_file" "$age_key_file")"
# NOTE(review): eval executes the decrypted text as shell — values with
# unquoted spaces or metacharacters will mis-parse (or worse). Only
# safe while the sops payload is fully trusted; confirm.
# shellcheck disable=SC2086
eval "$sops_env"
else
_log "No --sops flag — skipping sops decryption (importing plaintext .env only)"
fi
# Collect all import operations
declare -a operations=()
# --- From .env ---
# Bots: FORGE_{ROLE}_TOKEN + FORGE_{ROLE}_PASS
for role in "${BOT_ROLES[@]}"; do
local token_var="FORGE_${role^^}_TOKEN"
local pass_var="FORGE_${role^^}_PASS"
local token_val="${!token_var:-}"
local pass_val="${!pass_var:-}"
if [ -n "$token_val" ] && [ -n "$pass_val" ]; then
# NOTE(review): the vault key used below is "pass", but the header
# mapping table documents {token,password} — confirm which name the
# KV consumers actually read.
operations+=("bots|$role|token|$env_file|$token_var")
operations+=("bots|$role|pass|$env_file|$pass_var")
elif [ -n "$token_val" ] || [ -n "$pass_val" ]; then
_err "Warning: $role bot has token but no password (or vice versa), skipping"
fi
done
# Llama bot: FORGE_TOKEN_LLAMA + FORGE_PASS_LLAMA
local llama_token="${FORGE_TOKEN_LLAMA:-}"
local llama_pass="${FORGE_PASS_LLAMA:-}"
if [ -n "$llama_token" ] && [ -n "$llama_pass" ]; then
operations+=("bots|dev-qwen|token|$env_file|FORGE_TOKEN_LLAMA")
operations+=("bots|dev-qwen|pass|$env_file|FORGE_PASS_LLAMA")
elif [ -n "$llama_token" ] || [ -n "$llama_pass" ]; then
_err "Warning: dev-qwen bot has token but no password (or vice versa), skipping"
fi
# Generic forge creds: FORGE_TOKEN + FORGE_PASS
local forge_token="${FORGE_TOKEN:-}"
local forge_pass="${FORGE_PASS:-}"
if [ -n "$forge_token" ] && [ -n "$forge_pass" ]; then
operations+=("forge|token|$env_file|FORGE_TOKEN")
operations+=("forge|pass|$env_file|FORGE_PASS")
fi
# Forge admin token: FORGE_ADMIN_TOKEN
local forge_admin_token="${FORGE_ADMIN_TOKEN:-}"
if [ -n "$forge_admin_token" ]; then
operations+=("forge|admin_token|$env_file|FORGE_ADMIN_TOKEN")
fi
# Woodpecker secrets: WOODPECKER_*
# Only read from the .env file, not shell environment
# NOTE(review): the pattern also accepts WP_* keys, which the header
# mapping does not document — presumably a legacy prefix; verify.
local woodpecker_keys=()
while IFS='=' read -r key _; do
if [[ "$key" =~ ^WOODPECKER_ ]] || [[ "$key" =~ ^WP_[A-Z_]+$ ]]; then
woodpecker_keys+=("$key")
fi
done < <(grep -E '^[A-Z_][A-Z0-9_]*=' "$env_file" 2>/dev/null || true)
for key in "${woodpecker_keys[@]}"; do
local val="${!key}"
if [ -n "$val" ]; then
local lowercase_key="${key,,}"
operations+=("woodpecker|$lowercase_key|$env_file|$key")
fi
done
# Chat secrets: FORWARD_AUTH_SECRET, CHAT_OAUTH_CLIENT_ID, CHAT_OAUTH_CLIENT_SECRET
for key in FORWARD_AUTH_SECRET CHAT_OAUTH_CLIENT_ID CHAT_OAUTH_CLIENT_SECRET; do
local val="${!key:-}"
if [ -n "$val" ]; then
local lowercase_key="${key,,}"
operations+=("chat|$lowercase_key|$env_file|$key")
fi
done
# --- From sops-decrypted .env.vault.enc ---
# Runner tokens
for token_name in "${RUNNER_TOKENS[@]}"; do
local token_val="${!token_name:-}"
if [ -n "$token_val" ]; then
operations+=("runner|$token_name|$sops_file|$token_name")
fi
done
# If dry-run, just print the plan
if $dry_run; then
_log "=== DRY-RUN: Import plan ==="
_log "Environment file: $env_file"
if [ -n "$sops_file" ]; then
_log "Sops file: $sops_file"
_log "Age key: $age_key_file"
else
_log "Sops file: (none — --env-only import)"
fi
_log ""
_log "Planned operations:"
for op in "${operations[@]}"; do
_log " $op"
done
_log ""
_log "Total: ${#operations[@]} operations"
exit 0
fi
# --- Actual import with idempotency check ---
_log "=== Starting Vault import ==="
_log "Environment file: $env_file"
if [ -n "$sops_file" ]; then
_log "Sops file: $sops_file"
_log "Age key: $age_key_file"
else
_log "Sops file: (none — --env-only import)"
fi
_log ""
# NOTE(review): created/unchanged are never incremented anywhere below —
# the final summary always reports 0 for both, and `updated` counts
# changed *paths*, not keys. Confirm whether that is intended.
local created=0
local updated=0
local unchanged=0
# First pass: collect all operations with their parsed values.
# Store value and status in separate associative arrays keyed by
# "vault_path:kv_key". Secret values may contain any character, so we
# never pack them into a delimited string — the previous `value|status`
# encoding silently truncated values containing '|' (see issue #898).
declare -A ops_value
declare -A ops_status
declare -A path_seen
for op in "${operations[@]}"; do
# Parse operation: category|field|subkey|file|envvar (5 fields for bots/runner)
# or category|field|file|envvar (4 fields for forge/woodpecker/chat).
# These metadata strings are built from safe identifiers (role names,
# env-var names, file paths) and do not carry secret values, so '|' is
# still fine as a separator here.
local category field subkey file envvar=""
local field_count
field_count="$(printf '%s' "$op" | awk -F'|' '{print NF}')"
if [ "$field_count" -eq 5 ]; then
# 5 fields: category|role|subkey|file|envvar
IFS='|' read -r category field subkey file envvar <<< "$op"
else
# 4 fields: category|field|file|envvar
IFS='|' read -r category field file envvar <<< "$op"
subkey="$field" # For 4-field ops, field is the vault key
fi
# Determine Vault path and key based on category
local vault_path=""
local vault_key="$subkey"
local source_value=""
if [ "$file" = "$env_file" ]; then
# Source from environment file (envvar contains the variable name)
source_value="${!envvar:-}"
else
# Source from sops-decrypted env (envvar contains the variable name)
source_value="$(printf '%s' "$sops_env" | grep "^${envvar}=" | sed "s/^${envvar}=//" || true)"
fi
case "$category" in
bots)
vault_path="disinto/bots/${field}"
vault_key="$subkey"
;;
forge)
vault_path="disinto/shared/forge"
vault_key="$field"
;;
woodpecker)
vault_path="disinto/shared/woodpecker"
vault_key="$field"
;;
chat)
vault_path="disinto/shared/chat"
vault_key="$field"
;;
runner)
vault_path="disinto/runner/${field}"
vault_key="value"
;;
*)
_err "Unknown category: $category"
continue
;;
esac
# Determine status for this key
local status="created"
if _kv_path_exists "$vault_path"; then
local existing_value
# NOTE(review): this 2>/dev/null binds to the assignment command, not
# to the command substitution — _kv_get_value's stderr is NOT
# actually suppressed here.
if existing_value="$(_kv_get_value "$vault_path" "$vault_key")" 2>/dev/null; then
if [ "$existing_value" = "$source_value" ]; then
status="unchanged"
else
status="updated"
fi
fi
fi
# vault_path and vault_key are identifier-safe (no ':' in either), so
# the composite key round-trips cleanly via ${ck%:*} / ${ck#*:}.
local ck="${vault_path}:${vault_key}"
ops_value["$ck"]="$source_value"
ops_status["$ck"]="$status"
path_seen["$vault_path"]=1
done
# Second pass: group by vault_path and write.
# IMPORTANT: Always write ALL keys for a path, not just changed ones.
# KV v2 POST replaces the entire document, so we must include unchanged keys
# to avoid dropping them. The idempotency guarantee comes from KV v2 versioning.
for vault_path in "${!path_seen[@]}"; do
# Collect this path's "vault_key=source_value" pairs into a bash
# indexed array. Each element is one kv pair; '=' inside the value is
# preserved because _kv_put_secret splits on the *first* '=' only.
local pairs_array=()
local path_has_changes=0
for ck in "${!ops_value[@]}"; do
[ "${ck%:*}" = "$vault_path" ] || continue
local vault_key="${ck#*:}"
pairs_array+=("${vault_key}=${ops_value[$ck]}")
if [ "${ops_status[$ck]}" != "unchanged" ]; then
path_has_changes=1
fi
done
# Determine effective status for this path (updated if any key changed)
local effective_status="unchanged"
if [ "$path_has_changes" = 1 ]; then
effective_status="updated"
fi
if ! _kv_put_secret "$vault_path" "${pairs_array[@]}"; then
_err "Failed to write to $vault_path"
exit 1
fi
# Output status for each key in this path
for kv in "${pairs_array[@]}"; do
local kv_key="${kv%%=*}"
_format_status "$effective_status" "$vault_path" "$kv_key"
printf '\n'
done
# Count only if path has changes
if [ "$effective_status" = "updated" ]; then
((updated++)) || true
fi
done
_log ""
_log "=== Import complete ==="
_log "Created: $created"
_log "Updated: $updated"
_log "Unchanged: $unchanged"
}
main "$@"

View file

@ -1,207 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# tools/vault-seed-forgejo.sh — Idempotent seed for kv/disinto/shared/forgejo
#
# Part of the Nomad+Vault migration (S2.4, issue #882). Populates the KV v2
# path that nomad/jobs/forgejo.hcl reads from, so a clean-install factory
# (no old-stack secrets to import) still has per-key values for
# FORGEJO__security__SECRET_KEY + FORGEJO__security__INTERNAL_TOKEN.
#
# Companion to tools/vault-import.sh (S2.2, not yet merged) — when that
# import runs against a box with an existing stack, it overwrites these
# seeded values with the real ones. Order doesn't matter: whichever runs
# last wins, and both scripts are idempotent in the sense that re-running
# never rotates an existing non-empty key.
#
# Idempotency contract (per key):
# - Key missing or empty in Vault → generate a random value, write it,
# log "<key> generated (N bytes hex)".
# - Key present with a non-empty value → leave untouched, log
# "<key> unchanged".
# - Neither key changes is a silent no-op (no Vault write at all).
#
# Rotating an existing key is deliberately NOT in scope — SECRET_KEY
# rotation invalidates every existing session cookie in forgejo and
# INTERNAL_TOKEN rotation breaks internal RPC until all processes have
# restarted. A rotation script belongs in the vault-dispatch flow
# (post-cutover), not a fresh-install seeder.
#
# Preconditions:
# - Vault reachable + unsealed at $VAULT_ADDR.
# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable.
# - The `kv/` mount is enabled as KV v2 (this script enables it on a
# fresh box; on an existing box it asserts the mount type/version).
#
# Requires:
# - VAULT_ADDR (e.g. http://127.0.0.1:8200)
# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh)
# - curl, jq, openssl
#
# Usage:
# tools/vault-seed-forgejo.sh
# tools/vault-seed-forgejo.sh --dry-run
#
# Exit codes:
# 0 success (seed applied, or already applied)
# 1 precondition / API / mount-mismatch failure
# =============================================================================
set -euo pipefail
# Resolve the repo root relative to this script so it works from any CWD.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
# shellcheck source=../lib/hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
# KV v2 mount + logical path. Kept as two vars so the full API path used
# for GET/POST (which MUST include `/data/`) is built in one place.
KV_MOUNT="kv"
KV_LOGICAL_PATH="disinto/shared/forgejo"
KV_API_PATH="${KV_MOUNT}/data/${KV_LOGICAL_PATH}"
# Byte lengths for the generated secrets (hex output, so the printable
# string length is 2x these). 32 bytes matches forgejo's own
# `gitea generate secret SECRET_KEY` default; 64 bytes is comfortably
# above forgejo's INTERNAL_TOKEN JWT-HMAC key floor.
SECRET_KEY_BYTES=32
INTERNAL_TOKEN_BYTES=64
# log — tagged progress line on stdout.
log() {
  printf '[vault-seed-forgejo] %s\n' "$*"
}
# die — tagged error on stderr, then exit 1.
die() {
  printf '[vault-seed-forgejo] ERROR: %s\n' "$*" >&2
  exit 1
}
# ── Flag parsing — single optional `--dry-run`. Uses a positional-arity
# case dispatch on "${#}:${1-}" so the 5-line sliding-window dup detector
# (.woodpecker/detect-duplicates.py) sees a shape distinct from both
# vault-apply-roles.sh (if/elif chain) and vault-apply-policies.sh (flat
# case on $1 alone). Three sibling tools, three parser shapes.
DRY_RUN=0
case "$#:${1-}" in
0:)
# No arguments → normal (writing) run.
;;
1:--dry-run)
DRY_RUN=1
;;
1:-h|1:--help)
# Help exits 0; everything else falls through to the fatal default.
printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
printf 'Seed kv/disinto/shared/forgejo with random SECRET_KEY +\n'
printf 'INTERNAL_TOKEN if they are missing. Idempotent: existing\n'
printf 'non-empty values are left untouched.\n\n'
printf ' --dry-run Print planned actions (enable mount? which keys\n'
printf ' to generate?) without writing to Vault. Exits 0.\n'
exit 0
;;
*)
# More than one argument, or an unrecognized one → fatal.
die "invalid arguments: $* (try --help)"
;;
esac
# ── Preconditions ────────────────────────────────────────────────────────────
# Hard dependencies — fail fast with a named binary rather than mid-run.
for bin in curl jq openssl; do
command -v "$bin" >/dev/null 2>&1 \
|| die "required binary not found: ${bin}"
done
# Vault connectivity — short-circuit style (`||`) instead of an `if`-chain
# so this block has a distinct textual shape from vault-apply-roles.sh's
# equivalent preflight; hvault.sh's typed helpers emit structured JSON
# errors that don't render well behind the `[vault-seed-forgejo] …`
# log prefix, hence the inline check + plain-string diag.
[ -n "${VAULT_ADDR:-}" ] \
|| die "VAULT_ADDR unset — e.g. export VAULT_ADDR=http://127.0.0.1:8200"
hvault_token_lookup >/dev/null \
|| die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
# ── Step 1/2: ensure kv/ mount exists and is KV v2 ───────────────────────────
# The policy at vault/policies/service-forgejo.hcl grants read on
# `kv/data/<path>/*` — that `data` segment only exists for KV v2. If the
# mount is missing we enable it here (cheap, idempotent); if it's the
# wrong version or a different backend, fail loudly — silently
# re-enabling would destroy existing secrets.
log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──"
# DRY_RUN is exported so hvault_ensure_kv_v2 can see it — presumably it
# skips the mount write in dry-run; confirm in lib/hvault.sh.
export DRY_RUN
hvault_ensure_kv_v2 "$KV_MOUNT" "[vault-seed-forgejo]" \
|| die "KV mount check failed"
# ── Step 2/2: seed missing keys at kv/data/disinto/shared/forgejo ────────────
log "── Step 2/2: seed ${KV_API_PATH} ──"
# hvault_get_or_empty: raw 200 body, or "" on 404 (KV path absent). KV v2
# reads nest the secret under `.data.data`; `// ""` also folds a
# soft-deleted entry (200 with null inner data) into "missing".
current_body="$(hvault_get_or_empty "${KV_API_PATH}")" \
  || die "failed to read ${KV_API_PATH}"
have_secret_key=""
have_internal_token=""
if [ -n "$current_body" ]; then
  have_secret_key="$(printf '%s' "$current_body" | jq -r '.data.data.secret_key // ""')"
  have_internal_token="$(printf '%s' "$current_body" | jq -r '.data.data.internal_token // ""')"
fi
# Preserve existing values; only fill what is missing. In dry-run we
# record the intent without calling openssl — the value is random, so
# there is no "planned value" to preview.
desired_secret_key="$have_secret_key"
desired_internal_token="$have_internal_token"
generated=()
if [ -z "$desired_secret_key" ]; then
  generated+=("secret_key")
  [ "$DRY_RUN" -eq 1 ] || desired_secret_key="$(openssl rand -hex "$SECRET_KEY_BYTES")"
fi
if [ -z "$desired_internal_token" ]; then
  generated+=("internal_token")
  [ "$DRY_RUN" -eq 1 ] || desired_internal_token="$(openssl rand -hex "$INTERNAL_TOKEN_BYTES")"
fi
if [ "${#generated[@]}" -eq 0 ]; then
  log "all keys present at ${KV_API_PATH} — no-op"
  log "secret_key unchanged"
  log "internal_token unchanged"
  exit 0
fi
# _was_generated — membership test against the `generated` list.
_was_generated() {
  local k
  for k in "${generated[@]}"; do
    [ "$k" = "$1" ] && return 0
  done
  return 1
}
if [ "$DRY_RUN" -eq 1 ]; then
  log "[dry-run] would generate + write: ${generated[*]}"
  for key in secret_key internal_token; do
    if _was_generated "$key"; then
      log "[dry-run] ${key} would be generated"
    else
      log "[dry-run] ${key} unchanged"
    fi
  done
  exit 0
fi
# Write BOTH keys in one payload: KV v2 replaces `.data` atomically, so
# the existing value of the untouched key must ride along or it would be
# clobbered. The "preserve existing, fill missing" semantic comes from
# the desired_* = have_* initialization above.
payload="$(jq -n \
  --arg sk "$desired_secret_key" \
  --arg it "$desired_internal_token" \
  '{data: {secret_key: $sk, internal_token: $it}}')"
_hvault_request POST "${KV_API_PATH}" "$payload" >/dev/null \
  || die "failed to write ${KV_API_PATH}"
for key in secret_key internal_token; do
  if _was_generated "$key"; then
    log "${key} generated"
  else
    log "${key} unchanged"
  fi
done
log "done — ${#generated[@]} key(s) seeded at ${KV_API_PATH}"

View file

@ -1,145 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# tools/vault-seed-woodpecker.sh — Idempotent seed for kv/disinto/shared/woodpecker
#
# Part of the Nomad+Vault migration (S3.1 + S3.3, issues #934 + #936). Populates
# the KV v2 path read by nomad/jobs/woodpecker-server.hcl:
# - agent_secret: pre-shared secret for woodpecker-server ↔ agent communication
# - forgejo_client + forgejo_secret: OAuth2 client credentials from Forgejo
#
# This script handles BOTH:
# 1. S3.1: seeds `agent_secret` if missing
# 2. S3.3: calls wp-oauth-register.sh to create Forgejo OAuth app + store
# forgejo_client/forgejo_secret in Vault
#
# Idempotency contract:
# - agent_secret: missing → generate and write; present → skip, log unchanged
# - OAuth app + credentials: handled by wp-oauth-register.sh (idempotent)
# This script preserves any existing keys it doesn't own.
#
# Idempotency contract (per key):
# - Key missing or empty in Vault → generate a random value, write it,
# log "agent_secret generated".
# - Key present with a non-empty value → leave untouched, log
# "agent_secret unchanged".
#
# Preconditions:
# - Vault reachable + unsealed at $VAULT_ADDR.
# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable.
# - The `kv/` mount is enabled as KV v2 (this script enables it on a
# fresh box; on an existing box it asserts the mount type/version).
#
# Requires:
# - VAULT_ADDR (e.g. http://127.0.0.1:8200)
# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh)
# - curl, jq, openssl
#
# Usage:
# tools/vault-seed-woodpecker.sh
# tools/vault-seed-woodpecker.sh --dry-run
#
# Exit codes:
# 0 success (seed applied, or already applied)
# 1 precondition / API / mount-mismatch failure
# =============================================================================
set -euo pipefail
# Resolve repo-relative paths so the script works from any CWD.
SEED_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SEED_DIR}/.." && pwd)"
LIB_DIR="${REPO_ROOT}/lib/init/nomad"
# shellcheck source=../lib/hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
# KV v2 mount + logical path; the API path (with `/data/`) is what every
# GET/POST below uses.
KV_MOUNT="kv"
KV_LOGICAL_PATH="disinto/shared/woodpecker"
KV_API_PATH="${KV_MOUNT}/data/${KV_LOGICAL_PATH}"
AGENT_SECRET_BYTES=32 # 32 bytes → 64 hex chars
LOG_TAG="[vault-seed-woodpecker]"
# log/die — tagged stdout status line / fatal stderr message + exit 1.
log() { printf '%s %s\n' "$LOG_TAG" "$*"; }
die() { printf '%s ERROR: %s\n' "$LOG_TAG" "$*" >&2; exit 1; }
# ── Flag parsing ─────────────────────────────────────────────────────────────
# for-over-"$@" loop — shape distinct from vault-seed-forgejo.sh (arity:value
# case) and vault-apply-roles.sh (if/elif).
# --help exits 0; any unrecognized argument is fatal.
DRY_RUN=0
for arg in "$@"; do
case "$arg" in
--dry-run) DRY_RUN=1 ;;
-h|--help)
printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
printf 'Seed kv/disinto/shared/woodpecker with secrets.\n\n'
printf 'Handles both S3.1 (agent_secret) and S3.3 (OAuth app + credentials):\n'
printf ' - agent_secret: generated if missing\n'
printf ' - forgejo_client/forgejo_secret: created via Forgejo API if missing\n\n'
printf ' --dry-run Print planned actions without writing.\n'
exit 0
;;
*) die "invalid argument: ${arg} (try --help)" ;;
esac
done
# ── Preconditions — binary + Vault connectivity checks ───────────────────────
# Fail fast with a named binary / clear Vault diagnostic before touching KV.
required_bins=(curl jq openssl)
for bin in "${required_bins[@]}"; do
command -v "$bin" >/dev/null 2>&1 || die "required binary not found: ${bin}"
done
[ -n "${VAULT_ADDR:-}" ] || die "VAULT_ADDR unset — export VAULT_ADDR=http://127.0.0.1:8200"
hvault_token_lookup >/dev/null || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
# ── Step 1/3: ensure kv/ mount exists and is KV v2 ───────────────────────────
log "── Step 1/3: ensure ${KV_MOUNT}/ is KV v2 ──"
# Exported so hvault_ensure_kv_v2 can see DRY_RUN — presumably it skips
# the mount write in dry-run; confirm in lib/hvault.sh.
export DRY_RUN
hvault_ensure_kv_v2 "$KV_MOUNT" "[vault-seed-woodpecker]" \
|| die "KV mount check failed"
# ── Step 2/3: seed agent_secret at kv/data/disinto/shared/woodpecker ─────────
log "── Step 2/3: seed agent_secret ──"
raw_body="$(hvault_get_or_empty "${KV_API_PATH}")" \
  || die "failed to read ${KV_API_PATH}"
# Keep every pre-existing key so the eventual write can preserve them —
# KV v2 POST replaces `.data` atomically. A missing path acts like {}.
current_data="{}"
current_agent_secret=""
if [ -n "$raw_body" ]; then
  current_data="$(printf '%s' "$raw_body" | jq '.data.data // {}')"
  current_agent_secret="$(printf '%s' "$raw_body" | jq -r '.data.data.agent_secret // ""')"
fi
if [ -z "$current_agent_secret" ]; then
  # agent_secret is missing — generate (or just announce, in dry-run).
  if [ "$DRY_RUN" -eq 1 ]; then
    log "[dry-run] would generate + write: agent_secret"
  else
    fresh_secret="$(openssl rand -hex "$AGENT_SECRET_BYTES")"
    # Merge rather than replace so keys owned by other seeders (e.g.
    # S3.3's forgejo_client/forgejo_secret) survive this write.
    payload="$(printf '%s' "$current_data" \
      | jq --arg as "$fresh_secret" '{data: (. + {agent_secret: $as})}')"
    _hvault_request POST "${KV_API_PATH}" "$payload" >/dev/null \
      || die "failed to write ${KV_API_PATH}"
    log "agent_secret generated"
  fi
else
  log "agent_secret unchanged"
fi
# ── Step 3/3: register Forgejo OAuth app and store credentials ───────────────
log "── Step 3/3: register Forgejo OAuth app ──"
# Export DRY_RUN for the OAuth script and call it
# NOTE(review): DRY_RUN was already exported before Step 1 — this second
# export is redundant (harmless).
export DRY_RUN
# Best-effort dispatch: a wp-oauth-register.sh failure is tolerated in
# dry-run (`|| [ "$DRY_RUN" -eq 1 ]`) and merely logged when FORGE_URL is
# set.
# NOTE(review): when the script fails with DRY_RUN=0 and FORGE_URL unset,
# the failure is silently ignored — and the final "done" line below
# claims success either way; confirm that is intended.
if "${LIB_DIR}/wp-oauth-register.sh" || [ "$DRY_RUN" -eq 1 ]; then
:
elif [ -n "${FORGE_URL:-}" ]; then
# Forgejo was configured but unavailable
log "OAuth registration check failed (Forgejo may not be running)"
log "This is expected if Forgejo is not available"
fi
log "done — agent_secret + OAuth credentials seeded"

View file

@ -1,182 +0,0 @@
<!-- last-reviewed: a7a046b81a7f454ebec43bab643067bd952d50b0 -->
# vault/policies/ — Agent Instructions
HashiCorp Vault ACL policies for the disinto factory. One `.hcl` file per
policy; the basename (minus `.hcl`) is the Vault policy name applied to it.
Synced into Vault by `tools/vault-apply-policies.sh` (idempotent — see the
script header for the contract).
This directory is part of the **Nomad+Vault migration (Step 2)** — see
issues #879–#884. Policies attach to Nomad jobs via workload identity in
S2.4; this PR only lands the files + apply script.
## Naming convention
| Prefix | Audience | KV scope |
|---|---|---|
| `service-<name>.hcl` | Long-running platform services (forgejo, woodpecker) | `kv/data/disinto/shared/<name>/*` |
| `bot-<name>.hcl` | Per-agent jobs (dev, review, gardener, …) | `kv/data/disinto/bots/<name>/*` + shared forge URL |
| `runner-<TOKEN>.hcl` | Per-secret policy for vault-runner ephemeral dispatch | exactly one `kv/data/disinto/runner/<TOKEN>` path |
| `dispatcher.hcl` | Long-running edge dispatcher | `kv/data/disinto/runner/*` + `kv/data/disinto/shared/ops-repo/*` |
The KV mount name `kv/` is the convention this migration uses (mounted as
KV v2). Vault addresses KV v2 data at `kv/data/<path>` and metadata at
`kv/metadata/<path>` — policies that need `list` always target the
`metadata` path; reads target `data`.
## Policy → KV path summary
| Policy | Reads |
|---|---|
| `service-forgejo` | `kv/data/disinto/shared/forgejo/*` |
| `service-woodpecker` | `kv/data/disinto/shared/woodpecker/*` |
| `bot-<role>` (dev, review, gardener, architect, planner, predictor, supervisor, vault, dev-qwen) | `kv/data/disinto/bots/<role>/*` + `kv/data/disinto/shared/forge/*` |
| `runner-<TOKEN>` (GITHUB\_TOKEN, CODEBERG\_TOKEN, CLAWHUB\_TOKEN, DEPLOY\_KEY, NPM\_TOKEN, DOCKER\_HUB\_TOKEN) | `kv/data/disinto/runner/<TOKEN>` (exactly one) |
| `dispatcher` | `kv/data/disinto/runner/*` + `kv/data/disinto/shared/ops-repo/*` |
## Why one policy per runner secret
`vault-runner` (Step 5) reads each action TOML's `secrets = [...]` list
and composes only those `runner-<NAME>` policies onto the per-dispatch
ephemeral token. Wildcards or batched policies would hand the runner more
secrets than the action declared — defeats AD-006 (least-privilege per
external action). Adding a new declarable secret = adding one new
`runner-<NAME>.hcl` here + extending the SECRETS allow-list in vault-action
validation.
## Adding a new policy
1. Drop a file matching one of the four naming patterns above. Use an
existing file in the same family as the template — comment header,
capability list, and KV path layout should match the family.
2. Run `vault policy fmt <file>` locally so the formatting matches what
the CI fmt-check (step 4 of `.woodpecker/nomad-validate.yml`) will
accept. The fmt check runs non-destructively in CI but a dirty file
fails the step; running `fmt` locally before pushing is the fastest
path.
3. Add the matching entry to `../roles.yaml` (see "JWT-auth roles" below)
so the CI role-reference check (step 6) stays green.
4. Run `tools/vault-apply-policies.sh --dry-run` to confirm the new
basename appears in the planned-work list with the expected SHA.
5. Run `tools/vault-apply-policies.sh` against a Vault instance to
create it; re-run to confirm it reports `unchanged`.
## JWT-auth roles (S2.3)
Policies are inert until a Vault token carrying them is minted. In this
migration that mint path is JWT auth — Nomad jobs exchange their
workload-identity JWT for a Vault token via
`auth/jwt-nomad/role/<name>` → `token_policies = ["<policy>"]`. The
role bindings live in [`../roles.yaml`](../roles.yaml); the script that
enables the auth method + writes the config + applies roles is
[`lib/init/nomad/vault-nomad-auth.sh`](../../lib/init/nomad/vault-nomad-auth.sh).
The applier is [`tools/vault-apply-roles.sh`](../../tools/vault-apply-roles.sh).
### Role → policy naming convention
Role name == policy name, 1:1. `vault/roles.yaml` carries one entry per
`vault/policies/*.hcl` file:
```yaml
roles:
- name: service-forgejo # Vault role
policy: service-forgejo # ACL policy attached to minted tokens
namespace: default # bound_claims.nomad_namespace
job_id: forgejo # bound_claims.nomad_job_id
```
The role name is what jobspecs reference via `vault { role = "..." }` —
keep it identical to the policy basename so an S2.1↔S2.3 drift (new
policy without a role, or vice versa) shows up in one directory review,
not as a runtime "permission denied" at job placement.
`bound_claims.nomad_job_id` is the actual `job "..."` name in the
jobspec, which may differ from the policy name (e.g. policy
`service-forgejo` binds to job `forgejo`). Update it when each bot's or
runner's jobspec lands.
### Adding a new service
1. Write `vault/policies/<name>.hcl` using the naming-table family that
fits (`service-`, `bot-`, `runner-`, or standalone).
2. Add a matching entry to `vault/roles.yaml` with all four fields
(`name`, `policy`, `namespace`, `job_id`).
3. Apply both — either in one shot via `lib/init/nomad/vault-nomad-auth.sh`
(policies → roles → nomad SIGHUP), or granularly via
`tools/vault-apply-policies.sh` + `tools/vault-apply-roles.sh`.
4. Reference the role in the consuming jobspec's `vault { role = "<name>" }`.
### Token shape
All roles share the same token shape, hardcoded in
`tools/vault-apply-roles.sh`:
| Field | Value |
|---|---|
| `bound_audiences` | `["vault.io"]` — matches `default_identity.aud` in `nomad/server.hcl` |
| `token_type` | `service` — auto-revoked when the task exits |
| `token_ttl` | `1h` |
| `token_max_ttl` | `24h` |
Bumping any of these is a knowing, repo-wide change. Per-role overrides
would let one service's tokens outlive the others — add a field to
`vault/roles.yaml` and the applier at the same time if that ever
becomes necessary.
## Policy lifecycle
Adding a policy that an actual workload consumes is a three-step chain;
the CI pipeline guards each link.
1. **Add the policy HCL**`vault/policies/<name>.hcl`, formatted with
`vault policy fmt`. Capabilities must be drawn from the Vault-recognized
set (`read`, `list`, `create`, `update`, `delete`, `patch`, `sudo`,
`deny`); a typo fails CI step 5 (HCL written to an inline dev-mode Vault
via `vault policy write` — a real parser, not a regex).
2. **Update `../roles.yaml`** — add a JWT-auth role entry whose `policy:`
field matches the new basename (without `.hcl`). CI step 6 re-checks
every role in this file against the policy set, so a drift between the
two directories fails the step.
3. **Reference from a Nomad jobspec** — add `vault { role = "<name>" }` in
`nomad/jobs/<service>.hcl` (owned by S2.4). Policies do not take effect
until a Nomad job asks for a token via that role.
See the "Adding a new service" walkthrough above for the applier-script
flow once steps 1–3 are committed.
## CI enforcement (`.woodpecker/nomad-validate.yml`)
The pipeline triggers on any PR touching `vault/policies/**`,
`vault/roles.yaml`, or `lib/init/nomad/vault-*.sh` and runs four
vault-scoped checks (in addition to the nomad-scoped steps already in
place):
| Step | Tool | What it catches |
|---|---|---|
| 4. `vault-policy-fmt` | `vault policy fmt` + `diff` | formatting drift — trailing whitespace, wrong indentation, missing newlines |
| 5. `vault-policy-validate` | `vault policy write` against inline dev Vault | HCL syntax errors, unknown stanzas, invalid capability names (e.g. `"frobnicate"`), malformed `path "..." {}` blocks |
| 6. `vault-roles-validate` | yamllint + PyYAML | roles.yaml syntax drift, missing required fields, role→policy references with no matching `.hcl` |
| P11 | `lib/secret-scan.sh` via `.woodpecker/secret-scan.yml` | literal secret leaked into a policy HCL (rare copy-paste mistake) — already covers `vault/**/*`, no duplicate step here |
All four steps are fail-closed — any error blocks merge. The pipeline
pins `hashicorp/vault:1.18.5` (matching `lib/init/nomad/install.sh`);
bumping the runtime version without bumping the CI image is a CI-caught
drift.
## Common failure modes
| Symptom in CI logs | Root cause | Fix |
|---|---|---|
| `vault-policy-fmt: … is not formatted — run 'vault policy fmt <file>'` | Trailing whitespace / mixed indent in an HCL file | `vault policy fmt <file>` locally and re-commit |
| `vault-policy-validate: … failed validation` plus a `policy` error from Vault | Unknown capability (e.g. `"frobnicate"`), unknown stanza, malformed `path` block | Fix the HCL; valid capabilities are `read`, `list`, `create`, `update`, `delete`, `patch`, `sudo`, `deny` |
| `vault-roles-validate: ERROR: role 'X' references policy 'Y' but vault/policies/Y.hcl does not exist` | A role's `policy:` field does not match any file basename in `vault/policies/` | Either add the missing policy HCL or fix the typo in `roles.yaml` |
| `vault-roles-validate: ERROR: role entry missing required field 'Z'` | A role in `roles.yaml` is missing one of `name`, `policy`, `namespace`, `job_id` | Add the field; all four are required |
| P11 `secret-scan: detected potential secret …` on a `.hcl` file | A literal token/password was pasted into a policy | Policies must name KV paths, not carry secret values — move the literal into KV (S2.2) and have the policy grant `read` on the path |
## What this directory does NOT own
- **Attaching policies to Nomad jobs.** That's S2.4 (#882) via the
jobspec `template { vault { policies = […] } }` stanza — the role
name in `vault { role = "..." }` is what binds the policy.
- **Writing the secret values themselves.** That's S2.2 (#880) via
`tools/vault-import.sh`.

View file

@ -1,16 +0,0 @@
# vault/policies/bot-architect.hcl
#
# Architect agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the architect-agent Nomad job via workload identity (S2.4).
#
# KV v2 addressing: secret values live under kv/data/..., key listings
# under kv/metadata/... — `list` is only meaningful on the metadata path.
path "kv/data/disinto/bots/architect/*" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/bots/architect/*" {
  capabilities = ["list", "read"]
}

# Shared forge URL — common to every bot-* policy.
path "kv/data/disinto/shared/forge/*" {
  capabilities = ["read"]
}

View file

@ -1,18 +0,0 @@
# vault/policies/bot-dev-qwen.hcl
#
# Local-Qwen dev agent (agents-llama profile): reads its own bot KV
# namespace + the shared forge URL. Attached to the dev-qwen Nomad job
# via workload identity (S2.4). KV path mirrors the bot basename:
# kv/disinto/bots/dev-qwen/*.
#
# KV v2 addressing: secret values live under kv/data/..., key listings
# under kv/metadata/... — `list` is only meaningful on the metadata path.
path "kv/data/disinto/bots/dev-qwen/*" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/bots/dev-qwen/*" {
  capabilities = ["list", "read"]
}

# Shared forge URL — common to every bot-* policy.
path "kv/data/disinto/shared/forge/*" {
  capabilities = ["read"]
}

View file

@ -1,16 +0,0 @@
# vault/policies/bot-dev.hcl
#
# Dev agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the dev-agent Nomad job via workload identity (S2.4).
#
# KV v2 addressing: secret values live under kv/data/..., key listings
# under kv/metadata/... — `list` is only meaningful on the metadata path.
path "kv/data/disinto/bots/dev/*" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/bots/dev/*" {
  capabilities = ["list", "read"]
}

# Shared forge URL — common to every bot-* policy.
path "kv/data/disinto/shared/forge/*" {
  capabilities = ["read"]
}

View file

@ -1,16 +0,0 @@
# vault/policies/bot-gardener.hcl
#
# Gardener agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the gardener-agent Nomad job via workload identity (S2.4).
#
# KV v2 addressing: secret values live under kv/data/..., key listings
# under kv/metadata/... — `list` is only meaningful on the metadata path.
path "kv/data/disinto/bots/gardener/*" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/bots/gardener/*" {
  capabilities = ["list", "read"]
}

# Shared forge URL — common to every bot-* policy.
path "kv/data/disinto/shared/forge/*" {
  capabilities = ["read"]
}

View file

@ -1,16 +0,0 @@
# vault/policies/bot-planner.hcl
#
# Planner agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the planner-agent Nomad job via workload identity (S2.4).
#
# KV v2 addressing: secret values live under kv/data/..., key listings
# under kv/metadata/... — `list` is only meaningful on the metadata path.
path "kv/data/disinto/bots/planner/*" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/bots/planner/*" {
  capabilities = ["list", "read"]
}

# Shared forge URL — common to every bot-* policy.
path "kv/data/disinto/shared/forge/*" {
  capabilities = ["read"]
}

View file

@ -1,16 +0,0 @@
# vault/policies/bot-predictor.hcl
#
# Predictor agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the predictor-agent Nomad job via workload identity (S2.4).
#
# KV v2 addressing: secret values live under kv/data/..., key listings
# under kv/metadata/... — `list` is only meaningful on the metadata path.
path "kv/data/disinto/bots/predictor/*" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/bots/predictor/*" {
  capabilities = ["list", "read"]
}

# Shared forge URL — common to every bot-* policy.
path "kv/data/disinto/shared/forge/*" {
  capabilities = ["read"]
}

View file

@ -1,16 +0,0 @@
# vault/policies/bot-review.hcl
#
# Review agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the review-agent Nomad job via workload identity (S2.4).
#
# KV v2 addressing: secret values live under kv/data/..., key listings
# under kv/metadata/... — `list` is only meaningful on the metadata path.
path "kv/data/disinto/bots/review/*" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/bots/review/*" {
  capabilities = ["list", "read"]
}

# Shared forge URL — common to every bot-* policy.
path "kv/data/disinto/shared/forge/*" {
  capabilities = ["read"]
}

View file

@ -1,16 +0,0 @@
# vault/policies/bot-supervisor.hcl
#
# Supervisor agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the supervisor-agent Nomad job via workload identity (S2.4).
#
# KV v2 addressing: secret values live under kv/data/..., key listings
# under kv/metadata/... — `list` is only meaningful on the metadata path.
path "kv/data/disinto/bots/supervisor/*" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/bots/supervisor/*" {
  capabilities = ["list", "read"]
}

# Shared forge URL — common to every bot-* policy.
path "kv/data/disinto/shared/forge/*" {
  capabilities = ["read"]
}

View file

@ -1,20 +0,0 @@
# vault/policies/bot-vault.hcl
#
# Vault agent (the legacy edge dispatcher / vault-action runner): reads its
# own bot KV namespace + the shared forge URL. Attached to the vault-agent
# Nomad job via workload identity (S2.4).
#
# NOTE: distinct from the runner-* policies, which gate per-secret access
# for vault-runner ephemeral dispatches (Step 5).
#
# KV v2 addressing: secret values live under kv/data/..., key listings
# under kv/metadata/... — `list` is only meaningful on the metadata path.
path "kv/data/disinto/bots/vault/*" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/bots/vault/*" {
  capabilities = ["list", "read"]
}

# Shared forge URL — common to every bot-* policy.
path "kv/data/disinto/shared/forge/*" {
  capabilities = ["read"]
}

View file

@ -1,29 +0,0 @@
# vault/policies/dispatcher.hcl
#
# Edge dispatcher policy: needs to enumerate the runner secret namespace
# (to check secret presence before dispatching) and read the shared
# ops-repo credentials (token + clone URL) it uses to fetch action TOMLs.
#
# Scope:
#   - kv/disinto/runner/*          read all per-secret values + list keys
#   - kv/disinto/shared/ops-repo/* read the ops-repo creds bundle
#
# The actual ephemeral runner container created per dispatch gets the
# narrow runner-<NAME> policies, NOT this one. This policy stays bound
# to the long-running dispatcher only.
#
# KV v2 addressing: secret values live under kv/data/..., key listings
# under kv/metadata/... — `list` is only meaningful on the metadata path.
path "kv/data/disinto/runner/*" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/runner/*" {
  capabilities = ["list", "read"]
}

path "kv/data/disinto/shared/ops-repo/*" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/shared/ops-repo/*" {
  capabilities = ["list", "read"]
}

View file

@ -1,10 +0,0 @@
# vault/policies/runner-CLAWHUB_TOKEN.hcl
#
# Per-secret runner policy: ClawHub token for skill-registry publish.
# vault-runner (Step 5) composes only the runner-* policies named by the
# dispatching action's `secrets = [...]` list, so this policy intentionally
# scopes a single KV path — no wildcards, no list capability.
path "kv/data/disinto/runner/CLAWHUB_TOKEN" {
  capabilities = ["read"]
}

View file

@ -1,10 +0,0 @@
# vault/policies/runner-CODEBERG_TOKEN.hcl
#
# Per-secret runner policy: Codeberg PAT for upstream-repo mirror push.
# vault-runner (Step 5) composes only the runner-* policies named by the
# dispatching action's `secrets = [...]` list, so this policy intentionally
# scopes a single KV path — no wildcards, no list capability.
path "kv/data/disinto/runner/CODEBERG_TOKEN" {
  capabilities = ["read"]
}

View file

@ -1,10 +0,0 @@
# vault/policies/runner-DEPLOY_KEY.hcl
#
# Per-secret runner policy: SSH deploy key for git push to a release target.
# vault-runner (Step 5) composes only the runner-* policies named by the
# dispatching action's `secrets = [...]` list, so this policy intentionally
# scopes a single KV path — no wildcards, no list capability.
path "kv/data/disinto/runner/DEPLOY_KEY" {
  capabilities = ["read"]
}

View file

@ -1,10 +0,0 @@
# vault/policies/runner-DOCKER_HUB_TOKEN.hcl
#
# Per-secret runner policy: Docker Hub access token for image push.
# vault-runner (Step 5) composes only the runner-* policies named by the
# dispatching action's `secrets = [...]` list, so this policy intentionally
# scopes a single KV path — no wildcards, no list capability.
path "kv/data/disinto/runner/DOCKER_HUB_TOKEN" {
  capabilities = ["read"]
}

View file

@ -1,10 +0,0 @@
# vault/policies/runner-GITHUB_TOKEN.hcl
#
# Per-secret runner policy: GitHub PAT for cross-mirror push / API calls.
# vault-runner (Step 5) composes only the runner-* policies named by the
# dispatching action's `secrets = [...]` list, so this policy intentionally
# scopes a single KV path — no wildcards, no list capability.
path "kv/data/disinto/runner/GITHUB_TOKEN" {
  capabilities = ["read"]
}

View file

@ -1,10 +0,0 @@
# vault/policies/runner-NPM_TOKEN.hcl
#
# Per-secret runner policy: npm registry auth token for package publish.
# vault-runner (Step 5) composes only the runner-* policies named by the
# dispatching action's `secrets = [...]` list, so this policy intentionally
# scopes a single KV path — no wildcards, no list capability.
path "kv/data/disinto/runner/NPM_TOKEN" {
  capabilities = ["read"]
}

View file

@ -1,15 +0,0 @@
# vault/policies/service-forgejo.hcl
#
# Read-only access to shared Forgejo secrets (admin password, OAuth client
# config). Attached to the Forgejo Nomad job via workload identity (S2.4).
#
# Scope: kv/disinto/shared/forgejo entries owned by the operator and
# shared between forgejo + the chat OAuth client (issue #855 lineage).
#
# NOTE(review): unlike service-woodpecker, these paths carry no trailing
# `/*` — they match only the single secret at .../shared/forgejo, not a
# subtree. Confirm this narrower scope is intentional.
path "kv/data/disinto/shared/forgejo" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/shared/forgejo" {
  capabilities = ["list", "read"]
}

View file

@ -1,15 +0,0 @@
# vault/policies/service-woodpecker.hcl
#
# Read-only access to shared Woodpecker secrets (agent secret, forge OAuth
# client). Attached to the Woodpecker Nomad job via workload identity (S2.4).
#
# Scope: kv/disinto/shared/woodpecker/* entries owned by the operator
# and consumed by woodpecker-server + woodpecker-agent.
#
# KV v2 addressing: secret values live under kv/data/..., key listings
# under kv/metadata/... — `list` is only meaningful on the metadata path.
path "kv/data/disinto/shared/woodpecker/*" {
  capabilities = ["read"]
}

path "kv/metadata/disinto/shared/woodpecker/*" {
  capabilities = ["list", "read"]
}

View file

@ -1,155 +0,0 @@
# =============================================================================
# vault/roles.yaml — Vault JWT-auth role bindings for Nomad workload identity
#
# Part of the Nomad+Vault migration (S2.3, issue #881). One entry per
# vault/policies/*.hcl policy. Each entry pairs:
#
#   - the Vault role name (what a Nomad job references via
#     `vault { role = "..." }` in its jobspec), with
#   - the ACL policy attached to tokens it mints, and
#   - the bound claims that gate which Nomad workloads may authenticate
#     through that role (prevents a jobspec named "woodpecker" from
#     asking for role "service-forgejo").
#
# The source of truth for *what* secrets each role's token can read is
# vault/policies/<policy>.hcl. This file only wires role→policy→claims.
# Keeping the two side-by-side in the repo means an S2.1↔S2.3 drift
# (new policy without a role, or vice versa) shows up in one directory
# review, not as a runtime "permission denied" at job placement.
#
# All roles share the same constants (hardcoded in tools/vault-apply-roles.sh):
#   - bound_audiences = ["vault.io"] — Nomad's default workload-identity aud
#   - token_type      = "service"    — revoked when task exits
#   - token_ttl       = "1h"         — token lifetime
#   - token_max_ttl   = "24h"        — hard cap across renewals
#
# Format (strict — parsed line-by-line by tools/vault-apply-roles.sh with
# awk; keep the "- name:" prefix + two-space nested indent exactly as
# shown below):
#
# roles:
#   - name: <vault-role-name>      # path: auth/jwt-nomad/role/<name>
#     policy: <acl-policy-name>    # must match vault/policies/<name>.hcl
#     namespace: <nomad-namespace> # bound_claims.nomad_namespace
#     job_id: <nomad-job-id>       # bound_claims.nomad_job_id
#
# All four fields are required. Comments (#) and blank lines are ignored.
#
# Adding a new role:
#   1. Land the companion vault/policies/<name>.hcl in S2.1 style.
#   2. Add a block here with all four fields.
#   3. Run tools/vault-apply-roles.sh to upsert it.
#   4. Re-run to confirm "role <name> unchanged".
# =============================================================================
roles:
  # ── Long-running services (nomad/jobs/<name>.hcl) ──────────────────────────
  # The jobspec's nomad job name is the bound job_id, e.g. `job "forgejo"`
  # in nomad/jobs/forgejo.hcl → job_id: forgejo. The policy name stays
  # `service-<name>` so the directory layout under vault/policies/ groups
  # platform services under a single prefix.
  - name: service-forgejo
    policy: service-forgejo
    namespace: default
    job_id: forgejo
  # Both woodpecker roles attach the same service-woodpecker policy —
  # server and agent read the same shared woodpecker KV subtree.
  - name: service-woodpecker
    policy: service-woodpecker
    namespace: default
    job_id: woodpecker-server
  - name: service-woodpecker-agent
    policy: service-woodpecker
    namespace: default
    job_id: woodpecker-agent
  # ── Per-agent bots (nomad/jobs/bot-<role>.hcl — land in later steps) ───────
  # job_id placeholders match the policy name 1:1 until each bot's jobspec
  # lands. When a bot's jobspec is added under nomad/jobs/, update the
  # corresponding job_id here to match the jobspec's `job "<name>"` — and
  # CI's S2.6 roles.yaml check will confirm the pairing.
  - name: bot-dev
    policy: bot-dev
    namespace: default
    job_id: bot-dev
  - name: bot-dev-qwen
    policy: bot-dev-qwen
    namespace: default
    job_id: bot-dev-qwen
  - name: bot-review
    policy: bot-review
    namespace: default
    job_id: bot-review
  - name: bot-gardener
    policy: bot-gardener
    namespace: default
    job_id: bot-gardener
  - name: bot-planner
    policy: bot-planner
    namespace: default
    job_id: bot-planner
  - name: bot-predictor
    policy: bot-predictor
    namespace: default
    job_id: bot-predictor
  - name: bot-supervisor
    policy: bot-supervisor
    namespace: default
    job_id: bot-supervisor
  - name: bot-architect
    policy: bot-architect
    namespace: default
    job_id: bot-architect
  - name: bot-vault
    policy: bot-vault
    namespace: default
    job_id: bot-vault
  # ── Edge dispatcher ────────────────────────────────────────────────────────
  - name: dispatcher
    policy: dispatcher
    namespace: default
    job_id: dispatcher
  # ── Per-secret runner roles ────────────────────────────────────────────────
  # vault-runner (Step 5) composes runner-<NAME> policies onto each
  # ephemeral dispatch token based on the action TOML's `secrets = [...]`.
  # The per-dispatch runner jobspec job_id follows the same `runner-<NAME>`
  # convention (one jobspec per secret, minted per dispatch) so the bound
  # claim matches the role name directly.
  - name: runner-GITHUB_TOKEN
    policy: runner-GITHUB_TOKEN
    namespace: default
    job_id: runner-GITHUB_TOKEN
  - name: runner-CODEBERG_TOKEN
    policy: runner-CODEBERG_TOKEN
    namespace: default
    job_id: runner-CODEBERG_TOKEN
  - name: runner-CLAWHUB_TOKEN
    policy: runner-CLAWHUB_TOKEN
    namespace: default
    job_id: runner-CLAWHUB_TOKEN
  - name: runner-DEPLOY_KEY
    policy: runner-DEPLOY_KEY
    namespace: default
    job_id: runner-DEPLOY_KEY
  - name: runner-NPM_TOKEN
    policy: runner-NPM_TOKEN
    namespace: default
    job_id: runner-NPM_TOKEN
  - name: runner-DOCKER_HUB_TOKEN
    policy: runner-DOCKER_HUB_TOKEN
    namespace: default
    job_id: runner-DOCKER_HUB_TOKEN