# =============================================================================
# .woodpecker/nomad-validate.yml — Static validation for Nomad+Vault artifacts
#
# Part of the Nomad+Vault migration (S0.5, issue #825). Locks in the
# "no-ad-hoc-steps" principle: every HCL/shell artifact under nomad/ or
# lib/init/nomad/, plus the `disinto init` dispatcher, gets checked
# before it can land.
#
# Also includes Vault policy validation (S2.6, issue #884):
#   - format round-trip check on vault/policies/*.hcl
#   - `vault policy write` (dev server) on each policy file
#   - roles.yaml validator (yamllint + policy reference check)
#   - secret-scan gate on policy files
#
# Triggers on PRs (and pushes) that touch any of:
#   nomad/**                        — HCL configs (server, client, vault)
#   lib/init/nomad/**               — cluster-up / install / systemd / vault-init
#   bin/disinto                     — `disinto init --backend=nomad` dispatcher
#   tests/disinto-init-nomad.bats   — the bats suite itself
#   vault/policies/*.hcl            — Vault ACL policies (S2.6)
#   vault/roles.yaml                — JWT auth role definitions (S2.6)
#   lib/init/nomad/vault-*.sh       — Vault init scripts (S2.6)
#   .woodpecker/nomad-validate.yml  — the pipeline definition
#
# Steps (all fail-closed — any error blocks merge):
#   1. nomad-config-validate   — `nomad config validate` on server + client HCL
#   2. nomad-job-validate      — `nomad job validate` looped over every
#                                nomad/jobs/*.hcl (new jobspecs get
#                                CI coverage automatically)
#   3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl
#   4. vault-policy-fmt        — `vault policy fmt` round-trip check on vault/policies/*.hcl
#   5. vault-policy-validate   — `vault policy write` against a dev server per policy file
#   6. vault-roles-validate    — yamllint + policy reference check
#   7. vault-secret-scan       — scan policy files for embedded secrets
#   8. shellcheck-nomad        — shellcheck the cluster-up + install scripts + disinto
#   9. bats-init-nomad         — `disinto init --backend=nomad --dry-run` smoke tests
#
# Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 /
# vault 1.18.5). Bump there AND here together — drift = CI passing on
# syntax the runtime would reject.
# =============================================================================

when:
  - event: [push, pull_request]
    path:
      - "nomad/**"
      - "lib/init/nomad/**"
      - "bin/disinto"
      - "tests/disinto-init-nomad.bats"
      - "vault/policies/*.hcl"
      - "vault/roles.yaml"
      - "lib/init/nomad/vault-*.sh"
      - ".woodpecker/nomad-validate.yml"

# Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is
# configured with REQUIRE_SIGN_IN, so anonymous git clones fail (exit 128).
# FORGE_TOKEN is injected globally via WOODPECKER_ENVIRONMENT.
clone:
  git:
    image: alpine/git
    commands:
      - AUTH_URL=$(printf '%s' "$CI_REPO_CLONE_URL" | sed "s|://|://token:$FORGE_TOKEN@|")
      - git clone --depth 1 "$AUTH_URL" .
      - git fetch --depth 1 origin "$CI_COMMIT_REF"
      - git checkout FETCH_HEAD

steps:
  # ── 1. Nomad HCL syntax check ────────────────────────────────────────────
  # `nomad config validate` parses server.hcl + client.hcl and fails on any
  # HCL/semantic error (unknown block, invalid port range, bad driver cfg).
  # vault.hcl is excluded — it's a Vault config, not Nomad, so it goes
  # through the vault-operator-diagnose step instead.
  - name: nomad-config-validate
    image: hashicorp/nomad:1.9.5
    commands:
      - nomad version
      - nomad config validate nomad/server.hcl nomad/client.hcl

  # ── 2. Nomad jobspec HCL syntax check ────────────────────────────────────
  # `nomad job validate` is a *different* tool from `nomad config validate` —
  # the former parses jobspec HCL (job/group/task blocks, driver config,
  # volume refs, network ports), the latter parses agent config HCL
  # (server/client blocks). Running step 1 on a jobspec would reject it
  # with "unknown block 'job'", and vice versa. Hence two separate steps.
  #
  # Validation is offline: no running Nomad server is required (exit 0 on
  # valid HCL, 1 on syntax/semantic error). The CLI takes a single path
  # argument so we loop over every `*.hcl` file under nomad/jobs/ —
  # that way a new jobspec PR gets CI coverage automatically (no separate
  # "edit the pipeline" step to forget). The `.hcl` suffix is the naming
  # convention: anything else in nomad/jobs/ is deliberately not validated
  # by this step.
  #
  # `[ -f "$f" ]` guards against the no-match case: POSIX sh does not
  # nullglob, so an empty jobs/ directory would leave the literal glob in
  # "$f" and fail. Today forgejo.hcl exists, but the guard keeps the
  # step safe during any future transient empty state.
  #
  # Scope note: offline validate catches jobspec-level errors (unknown
  # stanzas, missing required fields, wrong value types, invalid driver
  # config). It does NOT resolve cross-file references like host_volume
  # source names against nomad/client.hcl — that mismatch surfaces at
  # scheduling time on the live cluster, not here. The paired-write rule
  # in nomad/AGENTS.md ("add to both client.hcl and cluster-up.sh") is the
  # primary guardrail for that class of drift.
  - name: nomad-job-validate
    image: hashicorp/nomad:1.9.5
    commands:
      - |
        set -e
        for f in nomad/jobs/*.hcl; do
          [ -f "$f" ] || continue
          echo "validating jobspec: $f"
          nomad job validate "$f"
        done

  # ── 3. Vault HCL syntax check ────────────────────────────────────────────
  # `vault operator diagnose` loads the config and runs a suite of checks.
  # Exit codes:
  #   0 — all checks green
  #   1 — at least one hard failure (bad HCL, bad schema, unreachable storage)
  #   2 — advisory warnings only (no hard failure)
  # Our factory dev-box vault.hcl deliberately runs TLS-disabled on a
  # localhost-only listener (documented in nomad/vault.hcl), which triggers
  # an advisory "Check Listener TLS" warning → exit 2. The config still
  # parses, so we tolerate exit 2 and fail only on exit 1 or crashes.
  # -skip=storage/-skip=listener disables the runtime-only checks (vault's
  # container has /vault/file so storage is fine, but explicit skip is cheap
  # insurance against future container-image drift).
  - name: vault-operator-diagnose
    image: hashicorp/vault:1.18.5
    commands:
      - |
        rc=0
        vault operator diagnose -config=nomad/vault.hcl -skip=storage -skip=listener || rc=$?
        case "$rc" in
          0) echo "vault config: all checks green" ;;
          2) echo "vault config: parse OK (rc=2 — advisory warnings only; TLS-disabled on localhost listener is by design)" ;;
          *) echo "vault config: hard failure (rc=$rc)" >&2; exit "$rc" ;;
        esac

  # ── 4. Vault policy formatting check ─────────────────────────────────────
  # `vault policy fmt` has no -check flag — it rewrites the file in place.
  # To keep this step non-destructive we format a scratch copy of each policy
  # and compare it against the original; any difference means the file needs
  # formatting. This enforces consistent indentation (2-space), no trailing
  # whitespace, and proper HCL formatting conventions.
  #
  # CI runs this BEFORE the policy validation step because unformatted HCL
  # can sometimes cause confusing validation errors.
  - name: vault-policy-fmt
    image: hashicorp/vault:1.18.5
    commands:
      - |
        set -e
        failed=0
        for f in vault/policies/*.hcl; do
          [ -f "$f" ] || continue
          echo "fmt-check: $f"
          # Format a throwaway copy, then diff — the original is never touched.
          cp "$f" /tmp/fmt-check.hcl
          vault policy fmt /tmp/fmt-check.hcl > /dev/null
          if ! cmp -s "$f" /tmp/fmt-check.hcl; then
            echo "  ERROR: $f is not formatted correctly" >&2
            diff -u "$f" /tmp/fmt-check.hcl >&2 || true
            failed=1
          fi
        done
        if [ "$failed" -gt 0 ]; then
          echo "vault-policy-fmt: formatting errors found" >&2
          exit 1
        fi
        echo "vault-policy-fmt: all policies formatted correctly"

  # ── 5. Vault policy validate ─────────────────────────────────────────────
  # The Vault CLI has no `policy validate` subcommand, so each policy is
  # validated by uploading it with `vault policy write` to a throwaway dev
  # server. The server parses the HCL and rejects:
  #   - unknown stanzas/blocks
  #   - invalid path patterns
  #   - unknown capabilities (valid: read, list, create, update, delete, sudo)
  #   - missing required fields
  # Nothing persists — the dev server is in-memory and dies with the step.
  #
  # The dev server listens on http://127.0.0.1:8200 with a "root" token.
  # VAULT_ADDR must be exported explicitly: the CLI default is
  # https://127.0.0.1:8200 (TLS), which would never reach the plain-HTTP
  # dev listener — the readiness probe and the writes both depend on it.
  - name: vault-policy-validate
    image: hashicorp/vault:1.18.5
    commands:
      - |
        set -e
        export VAULT_ADDR=http://127.0.0.1:8200
        export VAULT_TOKEN=root
        # Start Vault dev server in background
        vault server -dev -dev-root-token-id=root -dev-listen-address=0.0.0.0:8200 &
        VAULT_PID=$!
        trap 'kill $VAULT_PID 2>/dev/null || true' EXIT
        # Wait for Vault to be ready; fail closed if it never comes up
        ready=0
        for i in $(seq 1 30); do
          if vault status > /dev/null 2>&1; then
            echo "vault-policy-validate: Vault is ready"
            ready=1
            break
          fi
          sleep 0.5
        done
        if [ "$ready" -ne 1 ]; then
          echo "vault-policy-validate: Vault dev server failed to start" >&2
          exit 1
        fi
        # Upload each policy under a throwaway ci-* name to validate it
        failed=0
        for f in vault/policies/*.hcl; do
          [ -f "$f" ] || continue
          echo "validate: $f"
          name="ci-$(basename "$f" .hcl)"
          if ! vault policy write "$name" "$f" > /dev/null 2>&1; then
            echo "  ERROR: $f validation failed" >&2
            vault policy write "$name" "$f" >&2 || true
            failed=1
          fi
        done
        if [ "$failed" -gt 0 ]; then
          echo "vault-policy-validate: validation errors found" >&2
          exit 1
        fi
        echo "vault-policy-validate: all policies valid"

  # ── 6. Vault roles.yaml validate ─────────────────────────────────────────
  # Validates vault/roles.yaml:
  #   1. yamllint check — ensures YAML syntax is valid
  #   2. Policy reference check — each role references a policy that exists
  #   3. Required fields check — each role has name, policies, and auth fields
  #
  # jq only understands JSON, not YAML, so checks 2–3 run as a small PyYAML
  # script (the image ships Python; `pip install yamllint` pulls in PyYAML).
  #
  # If roles.yaml doesn't exist yet, this step is skipped (it will be added
  # in S2.3 alongside JWT auth configuration).
  - name: vault-roles-validate
    image: python:3.12-alpine
    commands:
      - |
        set -e
        pip install --quiet yamllint
        if [ ! -f vault/roles.yaml ]; then
          echo "vault-roles-validate: roles.yaml not found, skipping"
          exit 0
        fi
        echo "yamllint: vault/roles.yaml"
        yamllint -q vault/roles.yaml || {
          echo "  ERROR: yamllint found issues" >&2
          exit 1
        }
        echo "  OK"
        echo "policy-reference-check: validating policy references"
        python3 - << 'EOF'
        import pathlib
        import sys

        import yaml

        data = yaml.safe_load(pathlib.Path("vault/roles.yaml").read_text()) or {}
        roles = data.get("roles", [])
        failed = 0

        # Required fields: every role needs name, policies, and auth.
        for i, role in enumerate(roles):
            for field in ("name", "policies", "auth"):
                if field not in role:
                    print(f"vault-roles-validate: ERROR: role #{i} missing required field '{field}'",
                          file=sys.stderr)
                    failed = 1

        referenced = sorted({p for r in roles for p in (r.get("policies") or [])})
        if not referenced:
            print("vault-roles-validate: no policies referenced in roles.yaml", file=sys.stderr)
            sys.exit(1)

        # Policy names are the .hcl basenames under vault/policies/.
        existing = {f.stem for f in pathlib.Path("vault/policies").glob("*.hcl")}
        for policy in referenced:
            if policy not in existing:
                print(f"vault-roles-validate: ERROR: policy '{policy}' referenced but not found",
                      file=sys.stderr)
                failed = 1

        if failed:
            print("vault-roles-validate: policy reference errors found", file=sys.stderr)
            sys.exit(1)
        print("vault-roles-validate: all policy references valid")
        EOF

  # ── 7. Vault secret-scan ─────────────────────────────────────────────────
  # Scans policy HCL files for embedded secrets (rare but dangerous copy-paste
  # mistake). Uses the same patterns as lib/secret-scan.sh:
  #   - Long hex strings (32+ chars)
  #   - API key patterns
  #   - URLs with embedded credentials
  #   - Bearer tokens
  #
  # Environment variables like $TOKEN or ${TOKEN} are excluded as safe.
  - name: vault-secret-scan
    image: alpine:3.19
    commands:
      - |
        set -e
        apk add --no-cache bash
        # Copy the secret-scan.sh script into the container
        cat > /tmp/secret-scan.sh << 'EOF'
        #!/usr/bin/env bash
        # Inline version of lib/secret-scan.sh for CI secret detection
        _SECRET_PATTERNS=(
          '[0-9a-fA-F]{32,}'
          'Bearer [A-Za-z0-9_/+=-]{20,}'
          '0x[0-9a-fA-F]{64}'
          'https?://[^[:space:]]*[0-9a-fA-F]{20,}'
          'AKIA[0-9A-Z]{16}'
          '(API_KEY|SECRET|TOKEN|PRIVATE_KEY|PASSWORD|INFURA|ALCHEMY)=[^[:space:]"]{16,}'
        )
        _SAFE_PATTERNS=(
          '\$\{?[A-Z_]+\}?'
          'commit [0-9a-f]{40}'
          'Merge [0-9a-f]{40}'
          'last-reviewed: [0-9a-f]{40}'
          'codeberg\.org/[^[:space:]]+'
          'localhost:3000/[^[:space:]]+'
          'SC[0-9]{4}'
        )
        scan_for_secrets() {
          local text="${1:-$(cat)}"
          local found=0
          local cleaned="$text"
          for safe in "${_SAFE_PATTERNS[@]}"; do
            cleaned=$(printf '%s' "$cleaned" | sed -E "s/${safe}/__SAFE__/g" 2>/dev/null || printf '%s' "$cleaned")
          done
          for pattern in "${_SECRET_PATTERNS[@]}"; do
            local matches
            matches=$(printf '%s' "$cleaned" | grep -oE "$pattern" 2>/dev/null || true)
            if [ -n "$matches" ]; then
              while IFS= read -r match; do
                [ "$match" = "__SAFE__" ] && continue
                [ -z "$match" ] && continue
                printf 'secret-scan: detected potential secret matching pattern [%s]: %s\n' \
                  "$pattern" "${match:0:8}...${match: -4}" >&2
                found=1
              done <<< "$matches"
            fi
          done
          return $found
        }
        # Actually run the scan on stdin — without this call the script only
        # defines the function and always exits 0, and the gate never fires.
        scan_for_secrets
        EOF
        chmod +x /tmp/secret-scan.sh
        # Scan policy files
        echo "secret-scan: vault/policies/*.hcl"
        failed=0
        for f in vault/policies/*.hcl; do
          [ -f "$f" ] || continue
          echo "  scanning: $f"
          if ! /tmp/secret-scan.sh < "$f"; then
            echo "  ERROR: potential secrets detected in $f" >&2
            failed=1
          fi
        done
        if [ "$failed" -gt 0 ]; then
          echo "vault-secret-scan: secrets detected" >&2
          exit 1
        fi
        echo "vault-secret-scan: no secrets detected"

  # ── 8. Shellcheck ────────────────────────────────────────────────────────
  # Covers the new lib/init/nomad/*.sh scripts plus bin/disinto (which owns
  # the backend dispatcher). bin/disinto has no .sh extension so the
  # repo-wide shellcheck in .woodpecker/ci.yml skips it — this step is the
  # one place it gets checked.
  - name: shellcheck-nomad
    image: koalaman/shellcheck-alpine:stable
    commands:
      - shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto

  # ── 9. bats: `disinto init --backend=nomad --dry-run` ────────────────────
  # Smoke-tests the CLI dispatcher: both --backend=nomad variants exit 0
  # with the expected step list, and --backend=docker stays on the docker
  # path (regression guard). Pure dry-run — no sudo, no network.
  - name: bats-init-nomad
    image: alpine:3.19
    commands:
      - apk add --no-cache bash bats
      - bats tests/disinto-init-nomad.bats