Merge pull request 'fix: [nomad-step-2] S2.1 — vault/policies/*.hcl + tools/vault-apply-policies.sh (#879)' (#888) from fix/issue-879 into main

2026-04-16 15:56:01 +00:00 · 2026-04-16 15:56:01 +00:00 · 3b6325fd4f
commit 3b6325fd4f
parent c3a61dce00 86807d6861
20 changed files with 499 additions and 0 deletions
--- a/tools/vault-apply-policies.sh
+++ b/tools/vault-apply-policies.sh
@ -0,0 +1,164 @@
+#!/usr/bin/env bash
+# =============================================================================
+# tools/vault-apply-policies.sh — Idempotent Vault policy sync
+#
+# Part of the Nomad+Vault migration (S2.1, issue #879). Reads every
+# vault/policies/*.hcl file and upserts it into Vault as an ACL policy
+# named after the file's basename (without the .hcl suffix).
+#
+# Idempotency contract:
+#   For each vault/policies/<NAME>.hcl:
+#     - Policy missing in Vault       → apply, log "policy <NAME> created"
+#     - Policy present, content same  → skip,  log "policy <NAME> unchanged"
+#     - Policy present, content diff  → apply, log "policy <NAME> updated"
+#
+#   Comparison is an exact match (trailing newlines ignored) against the
+#   on-server policy text, .data.policy of GET sys/policies/acl/<NAME>.
+#   Re-running with no file edits is a no-op reporting all "unchanged".
+#
+#   --dry-run: prints "would apply policy <NAME> (sha256=<SHA256>)" per file;
+#   does not call Vault at all (no GETs, no PUTs). Exits 0.
+#
+# Requires:
+#   - VAULT_ADDR   (e.g. http://127.0.0.1:8200)
+#   - VAULT_TOKEN  (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh)
+#   - curl, jq, sha256sum
+#
+# Usage:
+#   tools/vault-apply-policies.sh
+#   tools/vault-apply-policies.sh --dry-run
+#
+# Exit codes:
+#   0  success (policies synced, or --dry-run completed)
+#   1  precondition / API failure
+# =============================================================================
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+POLICIES_DIR="${REPO_ROOT}/vault/policies"
+
+# shellcheck source=../lib/hvault.sh
+source "${REPO_ROOT}/lib/hvault.sh"
+
+log() { printf '[vault-apply] %s\n' "$*"; }
+die() { printf '[vault-apply] ERROR: %s\n' "$*" >&2; exit 1; }
+
+# ── Flag parsing ─────────────────────────────────────────────────────────────
+# At most one argument is accepted: nothing, --dry-run, or -h/--help. Written
+# as a flat if/elif chain rather than a `while/case` loop so this block stays
+# textually distinct from the multi-flag parsers elsewhere in the repo (see
+# .woodpecker/detect-duplicates.py — sliding 5-line window).
+dry_run=false
+if [ "$#" -gt 1 ]; then
+  die "too many arguments (saw: $*)"
+fi
+
+flag="${1:-}"
+if [ -z "$flag" ]; then
+  :  # no flag → live run
+elif [ "$flag" = '--dry-run' ]; then
+  dry_run=true
+elif [ "$flag" = '-h' ] || [ "$flag" = '--help' ]; then
+  # One printf, one argument per output line; '' emits the blank lines.
+  printf '%s\n' \
+    "Usage: $(basename "$0") [--dry-run]" \
+    '' \
+    'Apply every vault/policies/*.hcl to Vault as an ACL policy.' \
+    'Idempotent: unchanged policies are reported as "unchanged" and' \
+    'not written.' \
+    '' \
+    '  --dry-run   Print policy names + content SHA256 that would be' \
+    '              applied, without contacting Vault. Exits 0.'
+  exit 0
+else
+  die "unknown flag: ${flag}"
+fi
+
+# ── Preconditions ────────────────────────────────────────────────────────────
+for bin in curl jq sha256sum; do
+  command -v "$bin" >/dev/null 2>&1 \
+    || die "required binary not found: ${bin}"
+done
+
+[ -d "$POLICIES_DIR" ] \
+  || die "policies directory not found: ${POLICIES_DIR}"
+
+# Collect policy files in a stable (lexicographic) order so log output is
+# deterministic across runs and CI diffs.
+mapfile -t POLICY_FILES < <(
+  find "$POLICIES_DIR" -maxdepth 1 -type f -name '*.hcl' | LC_ALL=C sort
+)
+
+if [ "${#POLICY_FILES[@]}" -eq 0 ]; then
+  die "no *.hcl files in ${POLICIES_DIR}"
+fi
+
+# ── Dry-run: print plan + exit (no Vault calls) ──────────────────────────────
+# Lists every policy file with the SHA256 of its content, then exits 0
+# before any code that contacts Vault is reached.
+if [ "$dry_run" = true ]; then
+  log "dry-run — ${#POLICY_FILES[@]} policy file(s) in ${POLICIES_DIR}"
+  for policy_path in "${POLICY_FILES[@]}"; do
+    policy_name="$(basename "$policy_path" .hcl)"
+    # sha256sum prints "<hash>  <file>"; keep only the hash column.
+    digest="$(sha256sum "$policy_path" | awk '{print $1}')"
+    log "would apply policy ${policy_name} (sha256=${digest})"
+  done
+  exit 0
+fi
+
+# ── Live run: Vault connectivity check ───────────────────────────────────────
+[ -n "${VAULT_ADDR:-}" ] \
+  || die "VAULT_ADDR is not set — export VAULT_ADDR=http://127.0.0.1:8200"
+
+# hvault_token_lookup both resolves the token (env or /etc/vault.d/root.token)
+# and confirms the server is reachable with a valid token. Fail fast here so
+# the per-file loop below doesn't emit N identical "HTTP 403" errors.
+hvault_token_lookup >/dev/null \
+  || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
+
+# ── Helper: fetch the on-server policy text, or empty if absent ──────────────
+# Echoes the current policy content on stdout. A 404 (policy does not exist
+# yet) is a non-error — we print nothing and exit 0 so the caller can treat
+# the empty string as "needs create". Any other non-2xx is a hard failure.
+#
+# Uses a subshell + EXIT trap (not RETURN) for tmpfile cleanup: the RETURN
+# trap does NOT fire on set-e abort, so if jq below tripped errexit the
+# tmpfile would leak. Subshell exit propagates via the function's last-
+# command exit status.
+fetch_current_policy() {
+  local name="$1"
+  (
+    local tmp http_code
+    tmp="$(mktemp)"
+    trap 'rm -f "$tmp"' EXIT
+    http_code="$(curl -sS -o "$tmp" -w '%{http_code}' \
+      -H "X-Vault-Token: ${VAULT_TOKEN}" \
+      "${VAULT_ADDR}/v1/sys/policies/acl/${name}")" \
+      || { printf '[vault-apply] ERROR: curl failed for policy %s\n' "$name" >&2; exit 1; }
+    case "$http_code" in
+      200) jq -r '.data.policy // ""' < "$tmp" ;;
+      404) printf '' ;;  # absent — caller treats as "create"
+      *)
+        printf '[vault-apply] ERROR: HTTP %s fetching policy %s:\n' "$http_code" "$name" >&2
+        cat "$tmp" >&2
+        exit 1
+        ;;
+    esac
+  )
+}
+
+# ── Apply each policy, reporting created/updated/unchanged ───────────────────
+log "syncing ${#POLICY_FILES[@]} polic(y|ies) from ${POLICIES_DIR}"
+
+for policy_file in "${POLICY_FILES[@]}"; do
+  name="$(basename "$policy_file" .hcl)"
+  desired="$(cat "$policy_file")"
+  if ! current="$(fetch_current_policy "$name")"; then
+    die "failed to read existing policy: ${name}"
+  fi
+
+  # Non-empty server text identical to the file → nothing to write.
+  if [ -n "$current" ] && [ "$current" = "$desired" ]; then
+    log "policy ${name} unchanged"
+    continue
+  fi
+
+  # Empty server text means "absent" → create; anything else is an update.
+  verb=update
+  [ -n "$current" ] || verb=create
+
+  hvault_policy_apply "$name" "$policy_file" \
+    || die "failed to ${verb} policy: ${name}"
+  log "policy ${name} ${verb}d"
+done
+
+log "done — ${#POLICY_FILES[@]} polic(y|ies) synced"
--- a/vault/policies/AGENTS.md
+++ b/vault/policies/AGENTS.md
@ -0,0 +1,66 @@
+# vault/policies/ — Agent Instructions
+
+HashiCorp Vault ACL policies for the disinto factory. One `.hcl` file per
+policy; the basename (minus `.hcl`) is the Vault policy name applied to it.
+Synced into Vault by `tools/vault-apply-policies.sh` (idempotent — see the
+script header for the contract).
+
+This directory is part of the **Nomad+Vault migration (Step 2)** — see
+issues #879–#884. Policies attach to Nomad jobs via workload identity in
+S2.4; this PR only lands the files + apply script.
+
+## Naming convention
+
+| Prefix | Audience | KV scope |
+|---|---|---|
+| `service-<name>.hcl`  | Long-running platform services (forgejo, woodpecker) | `kv/data/disinto/shared/<name>/*` |
+| `bot-<name>.hcl`      | Per-agent jobs (dev, review, gardener, …)            | `kv/data/disinto/bots/<name>/*` + shared forge URL |
+| `runner-<TOKEN>.hcl`  | Per-secret policy for vault-runner ephemeral dispatch | exactly one `kv/data/disinto/runner/<TOKEN>` path |
+| `dispatcher.hcl`      | Long-running edge dispatcher                         | `kv/data/disinto/runner/*` + `kv/data/disinto/shared/ops-repo/*` |
+
+The KV mount name `kv/` is the convention this migration uses (mounted as
+KV v2). Vault addresses KV v2 data at `kv/data/<path>` and metadata at
+`kv/metadata/<path>` — policies that need `list` always target the
+`metadata` path; reads target `data`.
+
+## Policy → KV path summary
+
+| Policy | Reads |
+|---|---|
+| `service-forgejo` | `kv/data/disinto/shared/forgejo/*` |
+| `service-woodpecker` | `kv/data/disinto/shared/woodpecker/*` |
+| `bot-<role>` (dev, review, gardener, architect, planner, predictor, supervisor, vault, dev-qwen) | `kv/data/disinto/bots/<role>/*` + `kv/data/disinto/shared/forge/*` |
+| `runner-<TOKEN>` (GITHUB\_TOKEN, CODEBERG\_TOKEN, CLAWHUB\_TOKEN, DEPLOY\_KEY, NPM\_TOKEN, DOCKER\_HUB\_TOKEN) | `kv/data/disinto/runner/<TOKEN>` (exactly one) |
+| `dispatcher` | `kv/data/disinto/runner/*` + `kv/data/disinto/shared/ops-repo/*` |
+
+## Why one policy per runner secret
+
+`vault-runner` (Step 5) reads each action TOML's `secrets = [...]` list
+and composes only those `runner-<NAME>` policies onto the per-dispatch
+ephemeral token. Wildcards or batched policies would hand the runner more
+secrets than the action declared — defeats AD-006 (least-privilege per
+external action). Adding a new declarable secret = adding one new
+`runner-<NAME>.hcl` here + extending the SECRETS allow-list in vault-action
+validation.
+
+## Adding a new policy
+
+1. Drop a file matching one of the four naming patterns above. Use an
+   existing file in the same family as the template — comment header,
+   capability list, and KV path layout should match the family.
+2. Run `tools/vault-apply-policies.sh --dry-run` to confirm the new
+   basename appears in the planned-work list with the expected SHA.
+3. Run `tools/vault-apply-policies.sh` against a Vault instance to
+   create it; re-run to confirm it reports `unchanged`.
+4. The CI fmt + validate step lands in S2.6 (#884). Until then
+   `vault policy fmt <file>` locally is the fastest sanity check.
+
+## What this directory does NOT own
+
+- **Attaching policies to Nomad jobs.** That's S2.4 (#882) via the
+  task-level jobspec `vault { policies = […] }` block (a sibling of
+  `template`, not nested inside it).
+- **Enabling JWT auth + Nomad workload identity roles.** That's S2.3
+  (#881).
+- **Writing the secret values themselves.** That's S2.2 (#880) via
+  `tools/vault-import.sh`.
+- **CI policy fmt + validate + roles.yaml check.** That's S2.6 (#884).
--- a/vault/policies/bot-architect.hcl
+++ b/vault/policies/bot-architect.hcl
@ -0,0 +1,16 @@
+# vault/policies/bot-architect.hcl
+#
+# Architect agent: reads its own bot KV namespace + the shared forge URL.
+# Attached to the architect-agent Nomad job via workload identity (S2.4).
+
+path "kv/data/disinto/bots/architect/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/bots/architect/*" {
+  capabilities = ["list", "read"]
+}
+
+path "kv/data/disinto/shared/forge/*" {
+  capabilities = ["read"]
+}
--- a/vault/policies/bot-dev-qwen.hcl
+++ b/vault/policies/bot-dev-qwen.hcl
@ -0,0 +1,18 @@
+# vault/policies/bot-dev-qwen.hcl
+#
+# Local-Qwen dev agent (agents-llama profile): reads its own bot KV
+# namespace + the shared forge URL. Attached to the dev-qwen Nomad job
+# via workload identity (S2.4). KV path mirrors the bot basename:
+# kv/disinto/bots/dev-qwen/*.
+
+path "kv/data/disinto/bots/dev-qwen/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/bots/dev-qwen/*" {
+  capabilities = ["list", "read"]
+}
+
+path "kv/data/disinto/shared/forge/*" {
+  capabilities = ["read"]
+}
--- a/vault/policies/bot-dev.hcl
+++ b/vault/policies/bot-dev.hcl
@ -0,0 +1,16 @@
+# vault/policies/bot-dev.hcl
+#
+# Dev agent: reads its own bot KV namespace + the shared forge URL.
+# Attached to the dev-agent Nomad job via workload identity (S2.4).
+
+path "kv/data/disinto/bots/dev/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/bots/dev/*" {
+  capabilities = ["list", "read"]
+}
+
+path "kv/data/disinto/shared/forge/*" {
+  capabilities = ["read"]
+}
--- a/vault/policies/bot-gardener.hcl
+++ b/vault/policies/bot-gardener.hcl
@ -0,0 +1,16 @@
+# vault/policies/bot-gardener.hcl
+#
+# Gardener agent: reads its own bot KV namespace + the shared forge URL.
+# Attached to the gardener-agent Nomad job via workload identity (S2.4).
+
+path "kv/data/disinto/bots/gardener/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/bots/gardener/*" {
+  capabilities = ["list", "read"]
+}
+
+path "kv/data/disinto/shared/forge/*" {
+  capabilities = ["read"]
+}
--- a/vault/policies/bot-planner.hcl
+++ b/vault/policies/bot-planner.hcl
@ -0,0 +1,16 @@
+# vault/policies/bot-planner.hcl
+#
+# Planner agent: reads its own bot KV namespace + the shared forge URL.
+# Attached to the planner-agent Nomad job via workload identity (S2.4).
+
+path "kv/data/disinto/bots/planner/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/bots/planner/*" {
+  capabilities = ["list", "read"]
+}
+
+path "kv/data/disinto/shared/forge/*" {
+  capabilities = ["read"]
+}
--- a/vault/policies/bot-predictor.hcl
+++ b/vault/policies/bot-predictor.hcl
@ -0,0 +1,16 @@
+# vault/policies/bot-predictor.hcl
+#
+# Predictor agent: reads its own bot KV namespace + the shared forge URL.
+# Attached to the predictor-agent Nomad job via workload identity (S2.4).
+
+path "kv/data/disinto/bots/predictor/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/bots/predictor/*" {
+  capabilities = ["list", "read"]
+}
+
+path "kv/data/disinto/shared/forge/*" {
+  capabilities = ["read"]
+}
--- a/vault/policies/bot-review.hcl
+++ b/vault/policies/bot-review.hcl
@ -0,0 +1,16 @@
+# vault/policies/bot-review.hcl
+#
+# Review agent: reads its own bot KV namespace + the shared forge URL.
+# Attached to the review-agent Nomad job via workload identity (S2.4).
+
+path "kv/data/disinto/bots/review/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/bots/review/*" {
+  capabilities = ["list", "read"]
+}
+
+path "kv/data/disinto/shared/forge/*" {
+  capabilities = ["read"]
+}
--- a/vault/policies/bot-supervisor.hcl
+++ b/vault/policies/bot-supervisor.hcl
@ -0,0 +1,16 @@
+# vault/policies/bot-supervisor.hcl
+#
+# Supervisor agent: reads its own bot KV namespace + the shared forge URL.
+# Attached to the supervisor-agent Nomad job via workload identity (S2.4).
+
+path "kv/data/disinto/bots/supervisor/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/bots/supervisor/*" {
+  capabilities = ["list", "read"]
+}
+
+path "kv/data/disinto/shared/forge/*" {
+  capabilities = ["read"]
+}
--- a/vault/policies/bot-vault.hcl
+++ b/vault/policies/bot-vault.hcl
@ -0,0 +1,20 @@
+# vault/policies/bot-vault.hcl
+#
+# Vault agent (the legacy edge dispatcher / vault-action runner): reads its
+# own bot KV namespace + the shared forge URL. Attached to the vault-agent
+# Nomad job via workload identity (S2.4).
+#
+# NOTE: distinct from the runner-* policies, which gate per-secret access
+# for vault-runner ephemeral dispatches (Step 5).
+
+path "kv/data/disinto/bots/vault/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/bots/vault/*" {
+  capabilities = ["list", "read"]
+}
+
+path "kv/data/disinto/shared/forge/*" {
+  capabilities = ["read"]
+}
--- a/vault/policies/dispatcher.hcl
+++ b/vault/policies/dispatcher.hcl
@ -0,0 +1,29 @@
+# vault/policies/dispatcher.hcl
+#
+# Edge dispatcher policy: needs to enumerate the runner secret namespace
+# (to check secret presence before dispatching) and read the shared
+# ops-repo credentials (token + clone URL) it uses to fetch action TOMLs.
+#
+# Scope:
+#   - kv/disinto/runner/*       — read all per-secret values + list keys
+#   - kv/disinto/shared/ops-repo/* — read the ops-repo creds bundle
+#
+# The actual ephemeral runner container created per dispatch gets the
+# narrow runner-<NAME> policies, NOT this one. This policy stays bound
+# to the long-running dispatcher only.
+
+path "kv/data/disinto/runner/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/runner/*" {
+  capabilities = ["list", "read"]
+}
+
+path "kv/data/disinto/shared/ops-repo/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/shared/ops-repo/*" {
+  capabilities = ["list", "read"]
+}
--- a/vault/policies/runner-CLAWHUB_TOKEN.hcl
+++ b/vault/policies/runner-CLAWHUB_TOKEN.hcl
@ -0,0 +1,10 @@
+# vault/policies/runner-CLAWHUB_TOKEN.hcl
+#
+# Per-secret runner policy: ClawHub token for skill-registry publish.
+# vault-runner (Step 5) composes only the runner-* policies named by the
+# dispatching action's `secrets = [...]` list, so this policy intentionally
+# scopes a single KV path — no wildcards, no list capability.
+
+path "kv/data/disinto/runner/CLAWHUB_TOKEN" {
+  capabilities = ["read"]
+}
--- a/vault/policies/runner-CODEBERG_TOKEN.hcl
+++ b/vault/policies/runner-CODEBERG_TOKEN.hcl
@ -0,0 +1,10 @@
+# vault/policies/runner-CODEBERG_TOKEN.hcl
+#
+# Per-secret runner policy: Codeberg PAT for upstream-repo mirror push.
+# vault-runner (Step 5) composes only the runner-* policies named by the
+# dispatching action's `secrets = [...]` list, so this policy intentionally
+# scopes a single KV path — no wildcards, no list capability.
+
+path "kv/data/disinto/runner/CODEBERG_TOKEN" {
+  capabilities = ["read"]
+}
--- a/vault/policies/runner-DEPLOY_KEY.hcl
+++ b/vault/policies/runner-DEPLOY_KEY.hcl
@ -0,0 +1,10 @@
+# vault/policies/runner-DEPLOY_KEY.hcl
+#
+# Per-secret runner policy: SSH deploy key for git push to a release target.
+# vault-runner (Step 5) composes only the runner-* policies named by the
+# dispatching action's `secrets = [...]` list, so this policy intentionally
+# scopes a single KV path — no wildcards, no list capability.
+
+path "kv/data/disinto/runner/DEPLOY_KEY" {
+  capabilities = ["read"]
+}
--- a/vault/policies/runner-DOCKER_HUB_TOKEN.hcl
+++ b/vault/policies/runner-DOCKER_HUB_TOKEN.hcl
@ -0,0 +1,10 @@
+# vault/policies/runner-DOCKER_HUB_TOKEN.hcl
+#
+# Per-secret runner policy: Docker Hub access token for image push.
+# vault-runner (Step 5) composes only the runner-* policies named by the
+# dispatching action's `secrets = [...]` list, so this policy intentionally
+# scopes a single KV path — no wildcards, no list capability.
+
+path "kv/data/disinto/runner/DOCKER_HUB_TOKEN" {
+  capabilities = ["read"]
+}
--- a/vault/policies/runner-GITHUB_TOKEN.hcl
+++ b/vault/policies/runner-GITHUB_TOKEN.hcl
@ -0,0 +1,10 @@
+# vault/policies/runner-GITHUB_TOKEN.hcl
+#
+# Per-secret runner policy: GitHub PAT for cross-mirror push / API calls.
+# vault-runner (Step 5) composes only the runner-* policies named by the
+# dispatching action's `secrets = [...]` list, so this policy intentionally
+# scopes a single KV path — no wildcards, no list capability.
+
+path "kv/data/disinto/runner/GITHUB_TOKEN" {
+  capabilities = ["read"]
+}
--- a/vault/policies/runner-NPM_TOKEN.hcl
+++ b/vault/policies/runner-NPM_TOKEN.hcl
@ -0,0 +1,10 @@
+# vault/policies/runner-NPM_TOKEN.hcl
+#
+# Per-secret runner policy: npm registry auth token for package publish.
+# vault-runner (Step 5) composes only the runner-* policies named by the
+# dispatching action's `secrets = [...]` list, so this policy intentionally
+# scopes a single KV path — no wildcards, no list capability.
+
+path "kv/data/disinto/runner/NPM_TOKEN" {
+  capabilities = ["read"]
+}
--- a/vault/policies/service-forgejo.hcl
+++ b/vault/policies/service-forgejo.hcl
@ -0,0 +1,15 @@
+# vault/policies/service-forgejo.hcl
+#
+# Read-only access to shared Forgejo secrets (admin password, OAuth client
+# config). Attached to the Forgejo Nomad job via workload identity (S2.4).
+#
+# Scope: kv/disinto/shared/forgejo/* — entries owned by the operator and
+# shared between forgejo + the chat OAuth client (issue #855 lineage).
+
+path "kv/data/disinto/shared/forgejo/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/shared/forgejo/*" {
+  capabilities = ["list", "read"]
+}
--- a/vault/policies/service-woodpecker.hcl
+++ b/vault/policies/service-woodpecker.hcl
@ -0,0 +1,15 @@
+# vault/policies/service-woodpecker.hcl
+#
+# Read-only access to shared Woodpecker secrets (agent secret, forge OAuth
+# client). Attached to the Woodpecker Nomad job via workload identity (S2.4).
+#
+# Scope: kv/disinto/shared/woodpecker/* — entries owned by the operator
+# and consumed by woodpecker-server + woodpecker-agent.
+
+path "kv/data/disinto/shared/woodpecker/*" {
+  capabilities = ["read"]
+}
+
+path "kv/metadata/disinto/shared/woodpecker/*" {
+  capabilities = ["list", "read"]
+}