diff --git a/.env.example b/.env.example index c1c0b98..a1f24d5 100644 --- a/.env.example +++ b/.env.example @@ -32,13 +32,10 @@ FORGE_URL=http://localhost:3000 # [CONFIG] local Forgejo instance # - FORGE_PASS_DEV_QWEN2 # Name conversion: tr 'a-z-' 'A-Z_' (lowercase→UPPER, hyphens→underscores). # The compose generator looks these up via the agent's `forge_user` field in -# the project TOML. The pre-existing `dev-qwen` llama agent uses -# FORGE_TOKEN_LLAMA / FORGE_PASS_LLAMA (kept for backwards-compat with the -# legacy `ENABLE_LLAMA_AGENT=1` single-agent path). +# the project TOML. Configure local-model agents via [agents.X] sections in +# projects/*.toml — this is the canonical activation path. FORGE_TOKEN= # [SECRET] dev-bot API token (default for all agents) FORGE_PASS= # [SECRET] dev-bot password for git HTTP push (#361) -FORGE_TOKEN_LLAMA= # [SECRET] dev-qwen API token (for agents-llama) -FORGE_PASS_LLAMA= # [SECRET] dev-qwen password for git HTTP push FORGE_REVIEW_TOKEN= # [SECRET] review-bot API token FORGE_REVIEW_PASS= # [SECRET] review-bot password for git HTTP push FORGE_PLANNER_TOKEN= # [SECRET] planner-bot API token @@ -107,13 +104,6 @@ FORWARD_AUTH_SECRET= # [SECRET] Shared secret for Caddy ↔ # Store all project secrets here so formulas reference env vars, never hardcode. BASE_RPC_URL= # [SECRET] on-chain RPC endpoint -# ── Local Qwen dev agent (optional) ────────────────────────────────────── -# Set ENABLE_LLAMA_AGENT=1 to emit agents-llama in docker-compose.yml. -# Requires a running llama-server reachable at ANTHROPIC_BASE_URL. -# See docs/agents-llama.md for details. -ENABLE_LLAMA_AGENT=0 # [CONFIG] 1 = enable agents-llama service -ANTHROPIC_BASE_URL= # [CONFIG] e.g. http://host.docker.internal:8081 - # ── Tuning ──────────────────────────────────────────────────────────────── CLAUDE_TIMEOUT=7200 # [CONFIG] max seconds per Claude invocation diff --git a/.gitignore b/.gitignore index 21c6fbc..a29450c 100644 --- a/.gitignore +++ b/.gitignore @@ -20,7 +20,6 @@ metrics/supervisor-metrics.jsonl # OS .DS_Store dev/ci-fixes-*.json -gardener/dust.jsonl # Individual encrypted secrets (managed by disinto secrets add) secrets/ diff --git a/.woodpecker/detect-duplicates.py b/.woodpecker/detect-duplicates.py index 0485833..58fc160 100644 --- a/.woodpecker/detect-duplicates.py +++ b/.woodpecker/detect-duplicates.py @@ -294,6 +294,13 @@ def main() -> int: "9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern", # Standard lib source block shared across formula-driven agent run scripts "330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)", + # Common vault-seed script patterns: logging helpers + flag parsing + # Used in tools/vault-seed-woodpecker.sh + lib/init/nomad/wp-oauth-register.sh + "843a1cbf987952697d4e05e96ed2b2d5": "Logging helpers + DRY_RUN init (vault-seed-woodpecker + wp-oauth-register)", + "ee51df9642f2ef37af73b0c15f4d8406": "Logging helpers + DRY_RUN loop start (vault-seed-woodpecker + wp-oauth-register)", + "9a57368f3c1dfd29ec328596b86962a0": "Flag parsing loop + case start (vault-seed-woodpecker + wp-oauth-register)", + "9d72d40ff303cbed0b7e628fc15381c3": "Case loop + dry-run handler (vault-seed-woodpecker + wp-oauth-register)", + "5b52ddbbf47948e3cbc1b383f0909588": "Help + invalid arg handler end (vault-seed-woodpecker + wp-oauth-register)", } if not sh_files: diff --git a/.woodpecker/nomad-validate.yml b/.woodpecker/nomad-validate.yml index 81e45ae..5a1cc7c 100644 --- a/.woodpecker/nomad-validate.yml +++ b/.woodpecker/nomad-validate.yml @@ -1,16 +1,21 @@ # ============================================================================= # .woodpecker/nomad-validate.yml — Static validation for Nomad+Vault artifacts # -# Part of the Nomad+Vault migration (S0.5, issue #825). Locks in the -# "no-ad-hoc-steps" principle: every HCL/shell artifact under nomad/ or -# lib/init/nomad/, plus the `disinto init` dispatcher, gets checked -# before it can land. +# Part of the Nomad+Vault migration (S0.5, issue #825; extended in S2.6, +# issue #884). Locks in the "no-ad-hoc-steps" principle: every HCL/shell +# artifact under nomad/, lib/init/nomad/, vault/policies/, plus the +# `disinto init` dispatcher and vault/roles.yaml, gets checked before it +# can land. # # Triggers on PRs (and pushes) that touch any of: # nomad/** — HCL configs (server, client, vault) -# lib/init/nomad/** — cluster-up / install / systemd / vault-init +# lib/init/nomad/** — cluster-up / install / systemd / vault-init / +# vault-nomad-auth (S2.6 trigger: vault-*.sh +# is a subset of this glob) # bin/disinto — `disinto init --backend=nomad` dispatcher # tests/disinto-init-nomad.bats — the bats suite itself +# vault/policies/** — Vault ACL policy HCL files (S2.1, S2.6) +# vault/roles.yaml — JWT-auth role bindings (S2.3, S2.6) # .woodpecker/nomad-validate.yml — the pipeline definition # # Steps (all fail-closed — any error blocks merge): @@ -19,8 +24,22 @@ # nomad/jobs/*.hcl (new jobspecs get # CI coverage automatically) # 3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl -# 4. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto -# 5. bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests +# 4. vault-policy-fmt — `vault policy fmt` idempotence check on +# every vault/policies/*.hcl (format drift = +# CI fail; non-destructive via cp+diff) +# 5. vault-policy-validate — HCL syntax + capability validation for every +# vault/policies/*.hcl via `vault policy write` +# against an inline dev-mode Vault server +# 6. vault-roles-validate — yamllint + role→policy reference check on +# vault/roles.yaml (every referenced policy +# must exist as vault/policies/.hcl) +# 7. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto +# 8. bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests +# +# Secret-scan coverage: vault/policies/*.hcl is already scanned by the +# P11 gate (.woodpecker/secret-scan.yml, issue #798) — its trigger path +# `vault/**/*` covers everything under this directory. We intentionally +# do NOT duplicate that gate here; one scanner, one source of truth. # # Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 / # vault 1.18.5). Bump there AND here together — drift = CI passing on @@ -34,6 +53,8 @@ when: - "lib/init/nomad/**" - "bin/disinto" - "tests/disinto-init-nomad.bats" + - "vault/policies/**" + - "vault/roles.yaml" - ".woodpecker/nomad-validate.yml" # Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is @@ -123,7 +144,176 @@ steps: *) echo "vault config: hard failure (rc=$rc)" >&2; exit "$rc" ;; esac - # ── 4. Shellcheck ──────────────────────────────────────────────────────── + # ── 4. Vault policy fmt idempotence check ──────────────────────────────── + # `vault policy fmt ` formats a local HCL policy file in place. + # There's no `-check`/dry-run flag (vault 1.18.5), so we implement a + # non-destructive check as cp → fmt-on-copy → diff against original. + # Any diff means the committed file would be rewritten by `vault policy + # fmt` — failure steers the author to run `vault policy fmt ` + # locally before pushing. + # + # Scope: vault/policies/*.hcl only. The `[ -f "$f" ]` guard handles the + # no-match case (POSIX sh does not nullglob) so an empty policies/ + # directory does not fail this step. + # + # Note: `vault policy fmt` is purely local (HCL text transform) and does + # not require a running Vault server, which is why this step can run + # without starting one. + - name: vault-policy-fmt + image: hashicorp/vault:1.18.5 + commands: + - | + set -e + failed=0 + for f in vault/policies/*.hcl; do + [ -f "$f" ] || continue + tmp="/tmp/$(basename "$f").fmt" + cp "$f" "$tmp" + vault policy fmt "$tmp" >/dev/null 2>&1 + if ! diff -u "$f" "$tmp"; then + echo "ERROR: $f is not formatted — run 'vault policy fmt $f' locally" >&2 + failed=1 + fi + done + if [ "$failed" -gt 0 ]; then + echo "vault-policy-fmt: formatting drift detected" >&2 + exit 1 + fi + echo "vault-policy-fmt: all policies formatted correctly" + + # ── 5. Vault policy HCL syntax + capability validation ─────────────────── + # Vault has no offline `vault policy validate` subcommand — the closest + # in-CLI validator is `vault policy write`, which sends the HCL to a + # running server which parses it, checks capability names against the + # known set (read, list, create, update, delete, patch, sudo, deny), + # and rejects unknown stanzas / malformed path blocks. We start an + # inline dev-mode Vault (in-memory, no persistence, root token = "root") + # for the duration of this step and loop `vault policy write` over every + # vault/policies/*.hcl; the policies never leave the ephemeral dev + # server, so this is strictly a validator — not a deploy. + # + # Exit-code handling: + # - `vault policy write` exits 0 on success, non-zero on any parse / + # semantic error. We aggregate failures across all files so a single + # CI run surfaces every broken policy (not just the first). + # - The dev server is killed on any step exit via EXIT trap so the + # step tears down cleanly even on failure. + # + # Why dev-mode is sufficient: we're not persisting secrets, only asking + # Vault to parse policy text. The factory's production Vault is NOT + # contacted. + - name: vault-policy-validate + image: hashicorp/vault:1.18.5 + commands: + - | + set -e + vault server -dev -dev-root-token-id=root -dev-listen-address=127.0.0.1:8200 >/tmp/vault-dev.log 2>&1 & + VAULT_PID=$! + trap 'kill "$VAULT_PID" 2>/dev/null || true' EXIT INT TERM + export VAULT_ADDR=http://127.0.0.1:8200 + export VAULT_TOKEN=root + ready=0 + i=0 + while [ "$i" -lt 30 ]; do + if vault status >/dev/null 2>&1; then + ready=1 + break + fi + i=$((i + 1)) + sleep 0.5 + done + if [ "$ready" -ne 1 ]; then + echo "vault-policy-validate: dev server failed to start after 15s" >&2 + cat /tmp/vault-dev.log >&2 || true + exit 1 + fi + failed=0 + for f in vault/policies/*.hcl; do + [ -f "$f" ] || continue + name=$(basename "$f" .hcl) + echo "validate: $f" + if ! vault policy write "$name" "$f"; then + echo " ERROR: $f failed validation" >&2 + failed=1 + fi + done + if [ "$failed" -gt 0 ]; then + echo "vault-policy-validate: validation errors found" >&2 + exit 1 + fi + echo "vault-policy-validate: all policies valid" + + # ── 6. vault/roles.yaml validator ──────────────────────────────────────── + # Validates the JWT-auth role bindings file (S2.3). Two checks: + # + # a. `yamllint` — catches YAML syntax errors and indentation drift. + # Uses a relaxed config (line length bumped to 200) because + # roles.yaml's comments are wide by design. + # b. role → policy reference check — every role's `policy:` field + # must match a basename in vault/policies/*.hcl. A role pointing + # at a non-existent policy = runtime "permission denied" at job + # placement; catching the drift here turns it into a CI failure. + # Also verifies each role entry has the four required fields + # (name, policy, namespace, job_id) per the file's documented + # format. + # + # Parsing is done with PyYAML (the roles.yaml format is a strict + # subset that awk-level parsing in tools/vault-apply-roles.sh handles + # too, but PyYAML in CI gives us structural validation for free). If + # roles.yaml is ever absent (e.g. reverted), the step skips rather + # than fails — presence is enforced by S2.3's own tooling, not here. + - name: vault-roles-validate + image: python:3.12-alpine + commands: + - pip install --quiet --disable-pip-version-check pyyaml yamllint + - | + set -e + if [ ! -f vault/roles.yaml ]; then + echo "vault-roles-validate: vault/roles.yaml not present, skipping" + exit 0 + fi + yamllint -d '{extends: relaxed, rules: {line-length: {max: 200}}}' vault/roles.yaml + echo "vault-roles-validate: yamllint OK" + python3 - <<'PY' + import os + import sys + import yaml + + with open('vault/roles.yaml') as f: + data = yaml.safe_load(f) or {} + roles = data.get('roles') or [] + if not roles: + print("vault-roles-validate: no roles defined in vault/roles.yaml", file=sys.stderr) + sys.exit(1) + existing = { + os.path.splitext(e)[0] + for e in os.listdir('vault/policies') + if e.endswith('.hcl') + } + required = ('name', 'policy', 'namespace', 'job_id') + failed = 0 + for r in roles: + if not isinstance(r, dict): + print(f"ERROR: role entry is not a mapping: {r!r}", file=sys.stderr) + failed = 1 + continue + for field in required: + if r.get(field) in (None, ''): + print(f"ERROR: role entry missing required field '{field}': {r}", file=sys.stderr) + failed = 1 + policy = r.get('policy') + if policy and policy not in existing: + print( + f"ERROR: role '{r.get('name')}' references policy '{policy}' " + f"but vault/policies/{policy}.hcl does not exist", + file=sys.stderr, + ) + failed = 1 + sys.exit(failed) + PY + echo "vault-roles-validate: all role→policy references valid" + + # ── 7. Shellcheck ──────────────────────────────────────────────────────── # Covers the new lib/init/nomad/*.sh scripts plus bin/disinto (which owns # the backend dispatcher). bin/disinto has no .sh extension so the # repo-wide shellcheck in .woodpecker/ci.yml skips it — this step is the @@ -133,7 +323,7 @@ steps: commands: - shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto - # ── 5. bats: `disinto init --backend=nomad --dry-run` ──────────────────── + # ── 8. bats: `disinto init --backend=nomad --dry-run` ──────────────────── # Smoke-tests the CLI dispatcher: both --backend=nomad variants exit 0 # with the expected step list, and --backend=docker stays on the docker # path (regression guard). Pure dry-run — no sudo, no network. diff --git a/AGENTS.md b/AGENTS.md index eec058c..28c37b2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,4 +1,4 @@ - + # Disinto — Agent Instructions ## What this repo is @@ -37,17 +37,20 @@ disinto/ (code repo) │ examples/ — example vault action TOMLs (promote, publish, release, webhook-call) ├── lib/ env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, stack-lock.sh, forge-setup.sh, forge-push.sh, ops-setup.sh, ci-setup.sh, generators.sh, hire-agent.sh, release.sh, build-graph.py, branch-protection.sh, secret-scan.sh, tea-helpers.sh, action-vault.sh, ci-log-reader.py, git-creds.sh, sprint-filer.sh, hvault.sh │ hooks/ — Claude Code session hooks (on-compact-reinject, on-idle-stop, on-phase-change, on-pretooluse-guard, on-session-end, on-stop-failure) -│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825) +│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825); wp-oauth-register.sh (Forgejo OAuth2 app + Vault KV seeder for Woodpecker, S3.3) ├── nomad/ server.hcl, client.hcl, vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh +│ jobs/ — Nomad jobspecs: forgejo.hcl (Vault secrets via template, S2.4); woodpecker-server.hcl + woodpecker-agent.hcl (host-net, docker.sock, Vault KV, S3.1-S3.2) ├── projects/ *.toml.example — templates; *.toml — local per-box config (gitignored) ├── formulas/ Issue templates (TOML specs for multi-step agent tasks) ├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge dispatcher, chat (server.py, entrypoint-chat.sh, Dockerfile, ui/) ├── tools/ Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh) +│ vault-apply-policies.sh, vault-apply-roles.sh, vault-import.sh — Vault provisioning (S2.1/S2.2) +│ vault-seed-.sh — per-service Vault secret seeders; auto-invoked by `bin/disinto --with ` (add a new file to support a new service) ├── docs/ Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md) ├── site/ disinto.ai website content -├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, disinto-init-nomad.bats) +├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, lib-generators.bats, vault-import.bats, disinto-init-nomad.bats) ├── templates/ Issue templates -├── bin/ The `disinto` CLI script +├── bin/ The `disinto` CLI script (`--with ` deploys services + runs their Vault seeders) ├── disinto-factory/ Setup documentation and skill ├── state/ Runtime state ├── .woodpecker/ Woodpecker CI pipeline configs @@ -120,8 +123,7 @@ bash dev/phase-test.sh | Reproduce | `docker/reproduce/` | Bug reproduction using Playwright MCP | `formulas/reproduce.toml` | | Triage | `docker/reproduce/` | Deep root cause analysis | `formulas/triage.toml` | | Edge dispatcher | `docker/edge/` | Polls ops repo for vault actions, executes via Claude sessions | `docker/edge/dispatcher.sh` | -| agents-llama | `docker/agents/` (same image) | Local-Qwen dev agent (`AGENT_ROLES=dev`), gated on `ENABLE_LLAMA_AGENT=1` | [docs/agents-llama.md](docs/agents-llama.md) | -| agents-llama-all | `docker/agents/` (same image) | Local-Qwen all-roles agent (all 7 roles), profile `agents-llama-all` | [docs/agents-llama.md](docs/agents-llama.md) | +| Local-model agents | `docker/agents/` (same image) | Local llama-server agents configured via `[agents.X]` sections in project TOML | [docs/agents-llama.md](docs/agents-llama.md) | > **Vault:** Being redesigned as a PR-based approval workflow (issues #73-#77). > See [docs/VAULT.md](docs/VAULT.md) for the vault PR workflow details. @@ -192,9 +194,7 @@ Humans write these. Agents read and enforce them. ## Phase-Signaling Protocol -When running as a persistent tmux session, Claude must signal the orchestrator -at each phase boundary by writing to a phase file (e.g. -`/tmp/dev-session-{project}-{issue}.phase`). +When running as a persistent tmux session, Claude must signal the orchestrator at each phase boundary by writing to a phase file (e.g. `/tmp/dev-session-{project}-{issue}.phase`). Key phases: `PHASE:awaiting_ci` → `PHASE:awaiting_review` → `PHASE:done`. Also: `PHASE:escalate` (needs human input), `PHASE:failed`. See [docs/PHASE-PROTOCOL.md](docs/PHASE-PROTOCOL.md) for the complete spec, orchestrator reaction matrix, sequence diagram, and crash recovery. diff --git a/architect/AGENTS.md b/architect/AGENTS.md index 9582b03..1b2f9e8 100644 --- a/architect/AGENTS.md +++ b/architect/AGENTS.md @@ -1,4 +1,4 @@ - + # Architect — Agent Instructions ## What this agent is diff --git a/bin/disinto b/bin/disinto index 6128b7c..39817cf 100755 --- a/bin/disinto +++ b/bin/disinto @@ -82,13 +82,16 @@ Init options: --ci-id Woodpecker CI repo ID (default: 0 = no CI) --forge-url Forge base URL (default: http://localhost:3000) --backend Orchestration backend: docker (default) | nomad - --with (nomad) Deploy services: forgejo[,...] (S1.3) + --with (nomad) Deploy services: forgejo,woodpecker[,...] (S1.3, S3.4) --empty (nomad) Bring up cluster only, no jobs (S0.4) --bare Skip compose generation (bare-metal setup) --build Use local docker build instead of registry images (dev mode) --yes Skip confirmation prompts --rotate-tokens Force regeneration of all bot tokens/passwords (idempotent by default) --dry-run Print every intended action without executing + --import-env (nomad) Path to .env file for import into Vault KV (S2.5) + --import-sops (nomad) Path to sops-encrypted .env.vault.enc for import (S2.5) + --age-key (nomad) Path to age keyfile (required with --import-sops) (S2.5) Hire an agent options: --formula Path to role formula TOML (default: formulas/.toml) @@ -664,8 +667,13 @@ prompt_admin_password() { # `sudo disinto init ...` directly. _disinto_init_nomad() { local dry_run="${1:-false}" empty="${2:-false}" with_services="${3:-}" + local import_env="${4:-}" import_sops="${5:-}" age_key="${6:-}" local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh" local deploy_sh="${FACTORY_ROOT}/lib/init/nomad/deploy.sh" + local vault_engines_sh="${FACTORY_ROOT}/lib/init/nomad/vault-engines.sh" + local vault_policies_sh="${FACTORY_ROOT}/tools/vault-apply-policies.sh" + local vault_auth_sh="${FACTORY_ROOT}/lib/init/nomad/vault-nomad-auth.sh" + local vault_import_sh="${FACTORY_ROOT}/tools/vault-import.sh" if [ ! -x "$cluster_up" ]; then echo "Error: ${cluster_up} not found or not executable" >&2 @@ -677,6 +685,42 @@ _disinto_init_nomad() { exit 1 fi + # --empty short-circuits after cluster-up: no policies, no auth, no + # import, no deploy. It's the "cluster-only escape hatch" for debugging + # (docs/nomad-migration.md). Caller-side validation already rejects + # --empty combined with --with or any --import-* flag, so reaching + # this branch with those set is a bug in the caller. + # + # On the default (non-empty) path, vault-engines.sh (enables the kv/ + # mount), vault-apply-policies.sh, and vault-nomad-auth.sh are invoked + # unconditionally — they are idempotent and cheap to re-run, and + # subsequent --with deployments depend on them. vault-import.sh is + # invoked only when an --import-* flag is set. vault-engines.sh runs + # first because every policy and role below references kv/disinto/* + # paths, which 403 if the engine is not yet mounted (issue #912). + local import_any=false + if [ -n "$import_env" ] || [ -n "$import_sops" ]; then + import_any=true + fi + if [ "$empty" != "true" ]; then + if [ ! -x "$vault_engines_sh" ]; then + echo "Error: ${vault_engines_sh} not found or not executable" >&2 + exit 1 + fi + if [ ! -x "$vault_policies_sh" ]; then + echo "Error: ${vault_policies_sh} not found or not executable" >&2 + exit 1 + fi + if [ ! -x "$vault_auth_sh" ]; then + echo "Error: ${vault_auth_sh} not found or not executable" >&2 + exit 1 + fi + if [ "$import_any" = true ] && [ ! -x "$vault_import_sh" ]; then + echo "Error: ${vault_import_sh} not found or not executable" >&2 + exit 1 + fi + fi + # --empty and default both invoke cluster-up today. Log the requested # mode so the dispatch is visible in factory bootstrap logs — Step 1 # will branch on $empty to gate the job-deployment path. @@ -686,7 +730,7 @@ _disinto_init_nomad() { echo "nomad backend: default (cluster-up; jobs deferred to Step 1)" fi - # Dry-run: print cluster-up plan + deploy.sh plan + # Dry-run: print cluster-up plan + policies/auth/import plan + deploy.sh plan if [ "$dry_run" = "true" ]; then echo "" echo "── Cluster-up dry-run ─────────────────────────────────" @@ -694,20 +738,94 @@ _disinto_init_nomad() { "${cmd[@]}" || true echo "" + # --empty skips policies/auth/import/deploy — cluster-up only, no + # workloads. The operator-visible dry-run plan must match the real + # run, so short-circuit here too. + if [ "$empty" = "true" ]; then + exit 0 + fi + + # Vault engines + policies + auth are invoked on every nomad real-run + # path regardless of --import-* flags (they're idempotent; S2.1 + S2.3). + # Engines runs first because policies/roles/templates all reference the + # kv/ mount it enables (issue #912). Mirror that ordering in the + # dry-run plan so the operator sees the full sequence Step 2 will + # execute. + echo "── Vault engines dry-run ──────────────────────────────" + echo "[engines] [dry-run] ${vault_engines_sh} --dry-run" + echo "" + echo "── Vault policies dry-run ─────────────────────────────" + echo "[policies] [dry-run] ${vault_policies_sh} --dry-run" + echo "" + echo "── Vault auth dry-run ─────────────────────────────────" + echo "[auth] [dry-run] ${vault_auth_sh}" + echo "" + + # Import plan: one line per --import-* flag that is actually set. + # Printing independently (not in an if/elif chain) means that all + # three flags appearing together each echo their own path — the + # regression that bit prior implementations of this issue (#883). + if [ "$import_any" = true ]; then + echo "── Vault import dry-run ───────────────────────────────" + [ -n "$import_env" ] && echo "[import] --import-env env file: ${import_env}" + [ -n "$import_sops" ] && echo "[import] --import-sops sops file: ${import_sops}" + [ -n "$age_key" ] && echo "[import] --age-key age key: ${age_key}" + local -a import_dry_cmd=("$vault_import_sh") + [ -n "$import_env" ] && import_dry_cmd+=("--env" "$import_env") + [ -n "$import_sops" ] && import_dry_cmd+=("--sops" "$import_sops") + [ -n "$age_key" ] && import_dry_cmd+=("--age-key" "$age_key") + import_dry_cmd+=("--dry-run") + echo "[import] [dry-run] ${import_dry_cmd[*]}" + echo "" + else + echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" + echo "" + fi + if [ -n "$with_services" ]; then - echo "── Deploy services dry-run ────────────────────────────" - echo "[deploy] services to deploy: ${with_services}" + # Vault seed plan (S2.6, #928): one line per service whose + # tools/vault-seed-.sh ships. Sub-services (woodpecker-server, + # woodpecker-agent) map to their parent seeder (vault-seed-woodpecker.sh). + # Deduplicated so the seeder runs once even when both sub-services + # are present. + local seed_hdr_printed=false + local _seed_seen="" local IFS=',' for svc in $with_services; do svc=$(echo "$svc" | xargs) # trim whitespace - # Validate known services first + # Map sub-services to parent seed name + local seed_name="$svc" case "$svc" in - forgejo) ;; - *) - echo "Error: unknown service '${svc}' — known: forgejo" >&2 - exit 1 - ;; + woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; esac + # Deduplicate + if echo ",$_seed_seen," | grep -q ",$seed_name,"; then continue; fi + _seed_seen="${_seed_seen:+${_seed_seen},}${seed_name}" + local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" + if [ -x "$seed_script" ]; then + if [ "$seed_hdr_printed" = false ]; then + echo "── Vault seed dry-run ─────────────────────────────────" + seed_hdr_printed=true + fi + echo "[seed] [dry-run] ${seed_script} --dry-run" + fi + done + [ "$seed_hdr_printed" = true ] && echo "" + + echo "── Deploy services dry-run ────────────────────────────" + echo "[deploy] services to deploy: ${with_services}" + + # Build ordered deploy list: only include services present in with_services + local DEPLOY_ORDER="" + for ordered_svc in forgejo woodpecker-server woodpecker-agent; do + if echo ",$with_services," | grep -q ",$ordered_svc,"; then + DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}" + fi + done + echo "[deploy] deployment order: ${DEPLOY_ORDER}" + + local IFS=' ' + for svc in $DEPLOY_ORDER; do local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" if [ ! -f "$jobspec_path" ]; then echo "Error: jobspec not found: ${jobspec_path}" >&2 @@ -721,7 +839,7 @@ _disinto_init_nomad() { exit 0 fi - # Real run: cluster-up + deploy services + # Real run: cluster-up + policies + auth + (optional) import + deploy local -a cluster_cmd=("$cluster_up") if [ "$(id -u)" -eq 0 ]; then "${cluster_cmd[@]}" || exit $? @@ -733,27 +851,147 @@ _disinto_init_nomad() { sudo -n -- "${cluster_cmd[@]}" || exit $? fi + # --empty short-circuits here: cluster-up only, no policies/auth/import + # and no deploy. Matches the dry-run plan above and the docs/runbook. + if [ "$empty" = "true" ]; then + exit 0 + fi + + # Enable Vault secret engines (S2.1 / issue #912) — must precede + # policies/auth/import because every policy and every import target + # addresses paths under kv/. Idempotent, safe to re-run. + echo "" + echo "── Enabling Vault secret engines ──────────────────────" + local -a engines_cmd=("$vault_engines_sh") + if [ "$(id -u)" -eq 0 ]; then + "${engines_cmd[@]}" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: vault-engines.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${engines_cmd[@]}" || exit $? + fi + + # Apply Vault policies (S2.1) — idempotent, safe to re-run. + echo "" + echo "── Applying Vault policies ────────────────────────────" + local -a policies_cmd=("$vault_policies_sh") + if [ "$(id -u)" -eq 0 ]; then + "${policies_cmd[@]}" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: vault-apply-policies.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${policies_cmd[@]}" || exit $? + fi + + # Configure Vault JWT auth + Nomad workload identity (S2.3) — idempotent. + echo "" + echo "── Configuring Vault JWT auth ─────────────────────────" + local -a auth_cmd=("$vault_auth_sh") + if [ "$(id -u)" -eq 0 ]; then + "${auth_cmd[@]}" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: vault-nomad-auth.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${auth_cmd[@]}" || exit $? + fi + + # Import secrets if any --import-* flag is set (S2.2). + if [ "$import_any" = true ]; then + echo "" + echo "── Importing secrets into Vault ───────────────────────" + local -a import_cmd=("$vault_import_sh") + [ -n "$import_env" ] && import_cmd+=("--env" "$import_env") + [ -n "$import_sops" ] && import_cmd+=("--sops" "$import_sops") + [ -n "$age_key" ] && import_cmd+=("--age-key" "$age_key") + if [ "$(id -u)" -eq 0 ]; then + "${import_cmd[@]}" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: vault-import.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${import_cmd[@]}" || exit $? + fi + else + echo "" + echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" + fi + + # Seed Vault for services that ship their own seeder (S2.6, #928). + # Convention: tools/vault-seed-.sh — auto-invoked when --with + # is requested. Runs AFTER vault-import so that real imported values + # win over generated seeds when both are present; each seeder is + # idempotent on a per-key basis (see vault-seed-forgejo.sh's + # "missing → generate, present → unchanged" contract), so re-running + # init does not rotate existing keys. Services without a seeder are + # silently skipped — keeps this loop forward-compatible with Step 3+ + # services that may ship their own seeder without touching bin/disinto. + # + # VAULT_ADDR is passed explicitly because cluster-up.sh writes the + # profile.d export *during* this same init run, so the current shell + # hasn't sourced it yet; sibling vault-* scripts (engines/policies/ + # auth/import) default VAULT_ADDR internally via _hvault_default_env, + # but vault-seed-forgejo.sh requires the caller to set it. + # + # The non-root branch invokes the seeder as `sudo -n -- env VAR=val + # script` rather than `sudo -n VAR=val -- script`: sudo treats bare + # `VAR=val` args as sudoers env-assignments, which the default + # `env_reset=on` policy silently discards unless the variable is in + # `env_keep` (VAULT_ADDR is not). Using `env` as the actual command + # sets VAULT_ADDR in the child process regardless of sudoers policy. + if [ -n "$with_services" ]; then + local vault_addr="${VAULT_ADDR:-http://127.0.0.1:8200}" + local _seed_seen="" + local IFS=',' + for svc in $with_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + # Map sub-services to parent seed name (S3.4) + local seed_name="$svc" + case "$svc" in + woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; + esac + # Deduplicate + if echo ",$_seed_seen," | grep -q ",$seed_name,"; then continue; fi + _seed_seen="${_seed_seen:+${_seed_seen},}${seed_name}" + local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" + if [ -x "$seed_script" ]; then + echo "" + echo "── Seeding Vault for ${seed_name} ───────────────────────────" + if [ "$(id -u)" -eq 0 ]; then + VAULT_ADDR="$vault_addr" "$seed_script" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: vault-seed-${seed_name}.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- env "VAULT_ADDR=$vault_addr" "$seed_script" || exit $? + fi + fi + done + fi + # Deploy services if requested if [ -n "$with_services" ]; then echo "" echo "── Deploying services ─────────────────────────────────" - local -a deploy_cmd=("$deploy_sh") - # Split comma-separated service list into positional args - local IFS=',' - for svc in $with_services; do - svc=$(echo "$svc" | xargs) # trim whitespace - if ! echo "$svc" | grep -qE '^[a-zA-Z0-9_-]+$'; then - echo "Error: invalid service name '${svc}' — must match ^[a-zA-Z0-9_-]+$" >&2 - exit 1 + + # Build ordered deploy list (S3.4): forgejo → woodpecker-server → woodpecker-agent + local DEPLOY_ORDER="" + for ordered_svc in forgejo woodpecker-server woodpecker-agent; do + if echo ",$with_services," | grep -q ",$ordered_svc,"; then + DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}" fi - # Validate known services FIRST (before jobspec check) - case "$svc" in - forgejo) ;; - *) - echo "Error: unknown service '${svc}' — known: forgejo" >&2 - exit 1 - ;; - esac + done + + local -a deploy_cmd=("$deploy_sh") + local IFS=' ' + for svc in $DEPLOY_ORDER; do # Check jobspec exists local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" if [ ! -f "$jobspec_path" ]; then @@ -777,10 +1015,26 @@ _disinto_init_nomad() { echo "" echo "── Summary ────────────────────────────────────────────" echo "Cluster: Nomad+Vault cluster is up" + echo "Policies: applied (Vault ACL)" + echo "Auth: Vault JWT auth + Nomad workload identity configured" + if [ "$import_any" = true ]; then + local import_desc="" + [ -n "$import_env" ] && import_desc+="${import_env} " + [ -n "$import_sops" ] && import_desc+="${import_sops} " + echo "Imported: ${import_desc% }" + else + echo "Imported: (none — seed kv/disinto/* manually before deploying secret-dependent services)" + fi echo "Deployed: ${with_services}" - if echo "$with_services" | grep -q "forgejo"; then + if echo ",$with_services," | grep -q ",forgejo,"; then echo "Ports: forgejo: 3000" fi + if echo ",$with_services," | grep -q ",woodpecker-server,"; then + echo " woodpecker-server: 8000" + fi + if echo ",$with_services," | grep -q ",woodpecker-agent,"; then + echo " woodpecker-agent: (agent connected)" + fi echo "────────────────────────────────────────────────────────" fi @@ -803,6 +1057,7 @@ disinto_init() { # Parse flags local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false with_services="" + local import_env="" import_sops="" age_key="" while [ $# -gt 0 ]; do case "$1" in --branch) branch="$2"; shift 2 ;; @@ -819,6 +1074,12 @@ disinto_init() { --yes) auto_yes=true; shift ;; --rotate-tokens) rotate_tokens=true; shift ;; --dry-run) dry_run=true; shift ;; + --import-env) import_env="$2"; shift 2 ;; + --import-env=*) import_env="${1#--import-env=}"; shift ;; + --import-sops) import_sops="$2"; shift 2 ;; + --import-sops=*) import_sops="${1#--import-sops=}"; shift ;; + --age-key) age_key="$2"; shift 2 ;; + --age-key=*) age_key="${1#--age-key=}"; shift ;; *) echo "Unknown option: $1" >&2; exit 1 ;; esac done @@ -859,11 +1120,80 @@ disinto_init() { exit 1 fi + # Normalize --with services (S3.4): expand 'woodpecker' shorthand to + # 'woodpecker-server,woodpecker-agent', auto-include forgejo when + # woodpecker is requested (OAuth dependency), and validate all names. + if [ -n "$with_services" ]; then + # Expand 'woodpecker' (bare) → 'woodpecker-server,woodpecker-agent'. + # Must not match already-expanded 'woodpecker-server'/'woodpecker-agent'. + local expanded="" + local IFS=',' + for _svc in $with_services; do + _svc=$(echo "$_svc" | xargs) + case "$_svc" in + woodpecker) _svc="woodpecker-server,woodpecker-agent" ;; + esac + expanded="${expanded:+${expanded},}${_svc}" + done + with_services="$expanded" + unset IFS + + # Auto-include forgejo when woodpecker is requested + if echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent," \ + && ! echo ",$with_services," | grep -q ",forgejo,"; then + echo "Note: --with woodpecker implies --with forgejo (OAuth dependency)" + with_services="forgejo,${with_services}" + fi + + # Validate all service names are known + local IFS=',' + for _svc in $with_services; do + _svc=$(echo "$_svc" | xargs) + case "$_svc" in + forgejo|woodpecker-server|woodpecker-agent) ;; + *) + echo "Error: unknown service '${_svc}' — known: forgejo, woodpecker-server, woodpecker-agent" >&2 + exit 1 + ;; + esac + done + unset IFS + fi + + # --import-* flag validation (S2.5). These three flags form an import + # triple and must be consistent before dispatch: sops encryption is + # useless without the age key to decrypt it, so either both --import-sops + # and --age-key are present or neither is. --import-env alone is fine + # (it just imports the plaintext dotenv). All three flags are nomad-only. + if [ -n "$import_sops" ] && [ -z "$age_key" ]; then + echo "Error: --import-sops requires --age-key" >&2 + exit 1 + fi + if [ -n "$age_key" ] && [ -z "$import_sops" ]; then + echo "Error: --age-key requires --import-sops" >&2 + exit 1 + fi + if { [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; } \ + && [ "$backend" != "nomad" ]; then + echo "Error: --import-env, --import-sops, and --age-key require --backend=nomad" >&2 + exit 1 + fi + + # --empty is the cluster-only escape hatch — it skips policies, auth, + # import, and deploy. Pairing it with --import-* silently does nothing, + # which is a worse failure mode than a clear error. Reject explicitly. + if [ "$empty" = true ] \ + && { [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; }; then + echo "Error: --empty and --import-env/--import-sops/--age-key are mutually exclusive" >&2 + exit 1 + fi + # Dispatch on backend — the nomad path runs lib/init/nomad/cluster-up.sh # (S0.4). The default and --empty variants are identical today; Step 1 # will branch on $empty to add job deployment to the default path. if [ "$backend" = "nomad" ]; then - _disinto_init_nomad "$dry_run" "$empty" "$with_services" + _disinto_init_nomad "$dry_run" "$empty" "$with_services" \ + "$import_env" "$import_sops" "$age_key" # shellcheck disable=SC2317 # _disinto_init_nomad always exits today; # `return` is defensive against future refactors. return @@ -977,7 +1307,6 @@ p.write_text(text) echo "" echo "[ensure] Forgejo admin user 'disinto-admin'" echo "[ensure] 8 bot users: dev-bot, review-bot, planner-bot, gardener-bot, vault-bot, supervisor-bot, predictor-bot, architect-bot" - echo "[ensure] 2 llama bot users: dev-qwen, dev-qwen-nightly" echo "[ensure] .profile repos for all bots" echo "[ensure] repo ${forge_repo} on Forgejo with collaborators" echo "[run] preflight checks" @@ -1173,19 +1502,6 @@ p.write_text(text) echo "Config: CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1 saved to .env" fi - # Write local-Qwen dev agent env keys with safe defaults (#769) - if ! grep -q '^ENABLE_LLAMA_AGENT=' "$env_file" 2>/dev/null; then - cat >> "$env_file" <<'LLAMAENVEOF' - -# Local Qwen dev agent (optional) — set to 1 to enable -ENABLE_LLAMA_AGENT=0 -FORGE_TOKEN_LLAMA= -FORGE_PASS_LLAMA= -ANTHROPIC_BASE_URL= -LLAMAENVEOF - echo "Config: ENABLE_LLAMA_AGENT keys written to .env (disabled by default)" - fi - # Create labels on remote create_labels "$forge_repo" "$forge_url" diff --git a/dev/AGENTS.md b/dev/AGENTS.md index 481bb1f..0d565c3 100644 --- a/dev/AGENTS.md +++ b/dev/AGENTS.md @@ -1,4 +1,4 @@ - + # Dev Agent **Role**: Implement issues autonomously — write code, push branches, address diff --git a/docker/agents/Dockerfile b/docker/agents/Dockerfile index 2939230..1bcba89 100644 --- a/docker/agents/Dockerfile +++ b/docker/agents/Dockerfile @@ -2,7 +2,7 @@ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ bash curl git jq tmux python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ - && pip3 install --break-system-packages networkx \ + && pip3 install --break-system-packages networkx tomlkit \ && rm -rf /var/lib/apt/lists/* # Pre-built binaries (copied from docker/agents/bin/) diff --git a/docker/agents/entrypoint.sh b/docker/agents/entrypoint.sh index a664a09..7c58674 100644 --- a/docker/agents/entrypoint.sh +++ b/docker/agents/entrypoint.sh @@ -17,6 +17,38 @@ set -euo pipefail # - predictor: every 24 hours (288 iterations * 5 min) # - supervisor: every SUPERVISOR_INTERVAL seconds (default: 1200 = 20 min) +# ── Migration check: reject ENABLE_LLAMA_AGENT ─────────────────────────────── +# #846: The legacy ENABLE_LLAMA_AGENT env flag is no longer supported. +# Activation is now done exclusively via [agents.X] sections in project TOML. +# If this legacy flag is detected, fail immediately with a migration message. +if [ "${ENABLE_LLAMA_AGENT:-}" = "1" ]; then + cat <<'MIGRATION_ERR' +FATAL: ENABLE_LLAMA_AGENT is no longer supported. + +The legacy ENABLE_LLAMA_AGENT=1 flag has been removed (#846). +Activation is now done exclusively via [agents.X] sections in projects/*.toml. + +To migrate: + 1. Remove ENABLE_LLAMA_AGENT from your .env or .env.enc file + 2. Add an [agents.] section to your project TOML: + + [agents.dev-qwen] + base_url = "http://your-llama-server:8081" + model = "unsloth/Qwen3.5-35B-A3B" + api_key = "sk-no-key-required" + roles = ["dev"] + forge_user = "dev-qwen" + compact_pct = 60 + poll_interval = 60 + + 3. Run: disinto init + 4. Start the agent: docker compose up -d agents-dev-qwen + +See docs/agents-llama.md for full details. +MIGRATION_ERR + exit 1 +fi + DISINTO_BAKED="/home/agent/disinto" DISINTO_LIVE="/home/agent/repos/_factory" DISINTO_DIR="$DISINTO_BAKED" # start with baked copy; switched to live checkout after bootstrap @@ -342,9 +374,32 @@ bootstrap_ops_repos # Bootstrap factory repo — switch DISINTO_DIR to live checkout (#593) bootstrap_factory_repo +# Validate that projects directory has at least one real .toml file (not .example) +# This prevents the silent-zombie mode where the polling loop matches zero files +# and does nothing forever. +validate_projects_dir() { + # NOTE: compgen -G exits non-zero when no matches exist, so piping it through + # `wc -l` under `set -eo pipefail` aborts the script before the FATAL branch + # can log a diagnostic (#877). Use the conditional form already adopted at + # lines above (see bootstrap_factory_repo, PROJECT_NAME parsing). + if ! compgen -G "${DISINTO_DIR}/projects/*.toml" >/dev/null 2>&1; then + log "FATAL: No real .toml files found in ${DISINTO_DIR}/projects/" + log "Expected at least one project config file (e.g., disinto.toml)" + log "The directory only contains *.toml.example template files." + log "Mount the host ./projects volume or copy real .toml files into the container." + exit 1 + fi + local toml_count + toml_count=$(compgen -G "${DISINTO_DIR}/projects/*.toml" | wc -l) + log "Projects directory validated: ${toml_count} real .toml file(s) found" +} + # Initialize state directory for check_active guards init_state_dir +# Validate projects directory before entering polling loop +validate_projects_dir + # Parse AGENT_ROLES env var (default: all agents) # Expected format: comma-separated list like "review,dev,gardener" AGENT_ROLES="${AGENT_ROLES:-review,dev,gardener,architect,planner,predictor,supervisor}" diff --git a/docs/agents-llama.md b/docs/agents-llama.md index bc973b7..b3a1334 100644 --- a/docs/agents-llama.md +++ b/docs/agents-llama.md @@ -2,9 +2,12 @@ Local-model agents run the same agent code as the Claude-backed agents, but connect to a local llama-server (or compatible OpenAI-API endpoint) instead of -the Anthropic API. This document describes the current activation flow using +the Anthropic API. This document describes the canonical activation flow using `disinto hire-an-agent` and `[agents.X]` TOML configuration. +> **Note:** The legacy `ENABLE_LLAMA_AGENT=1` env flag has been removed (#846). +> Activation is now done exclusively via `[agents.X]` sections in project TOML. + ## Overview Local-model agents are configured via `[agents.]` sections in diff --git a/docs/nomad-migration.md b/docs/nomad-migration.md new file mode 100644 index 0000000..02ff023 --- /dev/null +++ b/docs/nomad-migration.md @@ -0,0 +1,124 @@ + +# Nomad+Vault migration — cutover-day runbook + +`disinto init --backend=nomad` is the single entry-point that turns a fresh +LXC (with the disinto repo cloned) into a running Nomad+Vault cluster with +policies applied, JWT workload-identity auth configured, secrets imported +from the old docker stack, and services deployed. + +## Cutover-day invocation + +On the new LXC, as root (or an operator with NOPASSWD sudo): + +```bash +# Copy the plaintext .env + sops-encrypted .env.vault.enc + age keyfile +# from the old box first (out of band — SSH, USB, whatever your ops +# procedure allows). Then: + +sudo ./bin/disinto init \ + --backend=nomad \ + --import-env /tmp/.env \ + --import-sops /tmp/.env.vault.enc \ + --age-key /tmp/keys.txt \ + --with forgejo +``` + +This runs, in order: + +1. **`lib/init/nomad/cluster-up.sh`** (S0) — installs Nomad + Vault + binaries, writes `/etc/nomad.d/*`, initializes Vault, starts both + services, waits for the Nomad node to become ready. +2. **`tools/vault-apply-policies.sh`** (S2.1) — syncs every + `vault/policies/*.hcl` into Vault as an ACL policy. Idempotent. +3. **`lib/init/nomad/vault-nomad-auth.sh`** (S2.3) — enables Vault's + JWT auth method at `jwt-nomad`, points it at Nomad's JWKS, writes + one role per policy, reloads Nomad so jobs can exchange + workload-identity tokens for Vault tokens. Idempotent. +4. **`tools/vault-import.sh`** (S2.2) — reads `/tmp/.env` and the + sops-decrypted `/tmp/.env.vault.enc`, writes them to the KV paths + matching the S2.1 policy layout (`kv/disinto/bots/*`, `kv/disinto/shared/*`, + `kv/disinto/runner/*`). Idempotent (overwrites KV v2 data in place). +5. **`lib/init/nomad/deploy.sh forgejo`** (S1) — validates + runs the + `nomad/jobs/forgejo.hcl` jobspec. Forgejo reads its admin creds from + Vault via the `template` stanza (S2.4). + +## Flag summary + +| Flag | Meaning | +|---|---| +| `--backend=nomad` | Switch the init dispatcher to the Nomad+Vault path (instead of docker compose). | +| `--empty` | Bring the cluster up, skip policies/auth/import/deploy. Escape hatch for debugging. | +| `--with forgejo[,…]` | Deploy these services after the cluster is up. | +| `--import-env PATH` | Plaintext `.env` from the old stack. Optional. | +| `--import-sops PATH` | Sops-encrypted `.env.vault.enc` from the old stack. Requires `--age-key`. | +| `--age-key PATH` | Age keyfile used to decrypt `--import-sops`. Requires `--import-sops`. | +| `--dry-run` | Print the full plan (cluster-up + policies + auth + import + deploy) and exit. Touches nothing. | + +### Flag validation + +- `--import-sops` without `--age-key` → error. +- `--age-key` without `--import-sops` → error. +- `--import-env` alone (no sops) → OK (imports just the plaintext `.env`). +- `--backend=docker` with any `--import-*` flag → error. +- `--empty` with any `--import-*` flag → error (mutually exclusive: `--empty` + skips the import step, so pairing them silently discards the import + intent). + +## Idempotency + +Every layer is idempotent by design. Re-running the same command on an +already-provisioned box is a no-op at every step: + +- **Cluster-up:** second run detects running `nomad`/`vault` systemd + units and state files, skips re-init. +- **Policies:** byte-for-byte compare against on-server policy text; + "unchanged" for every untouched file. +- **Auth:** skips auth-method create if `jwt-nomad/` already enabled, + skips config write if the JWKS + algs match, skips server.hcl write if + the file on disk is identical to the repo copy. +- **Import:** KV v2 writes overwrite in place (same path, same keys, + same values → no new version). +- **Deploy:** `nomad job run` is declarative; same jobspec → no new + allocation. + +## Dry-run + +```bash +./bin/disinto init --backend=nomad \ + --import-env /tmp/.env \ + --import-sops /tmp/.env.vault.enc \ + --age-key /tmp/keys.txt \ + --with forgejo \ + --dry-run +``` + +Prints the five-section plan — cluster-up, policies, auth, import, +deploy — with every path and every argv that would be executed. No +network, no sudo, no state mutation. See +`tests/disinto-init-nomad.bats` for the exact output shape. + +## No-import path + +If you already have `kv/disinto/*` seeded by other means (manual +`vault kv put`, a replica, etc.), omit all three `--import-*` flags. +`disinto init --backend=nomad --with forgejo` still applies policies, +configures auth, and deploys — but skips the import step with: + +``` +[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services +``` + +Forgejo's template stanza will fail to render (and thus the allocation +will stall) until those KV paths exist — so either import them or seed +them first. + +## Secret hygiene + +- Never log a secret value. The CLI only prints paths (`--import-env`, + `--age-key`) and KV *paths* (`kv/disinto/bots/review/token`), never + the values themselves. `tools/vault-import.sh` is the only thing that + reads the values, and it pipes them directly into Vault's HTTP API. +- The age keyfile must be mode 0400 — `vault-import.sh` refuses to + source a keyfile with looser permissions. +- `VAULT_ADDR` must be localhost during import — the import tool + refuses to run against a remote Vault, preventing accidental exposure. diff --git a/formulas/release.sh b/formulas/release.sh index b8c4eb6..6526d1a 100644 --- a/formulas/release.sh +++ b/formulas/release.sh @@ -178,8 +178,8 @@ log "Tagged disinto/agents:${RELEASE_VERSION}" log "Step 6/6: Restarting agent containers" -docker compose stop agents agents-llama 2>/dev/null || true -docker compose up -d agents agents-llama +docker compose stop agents 2>/dev/null || true +docker compose up -d agents log "Agent containers restarted" # ── Done ───────────────────────────────────────────────────────────────── diff --git a/formulas/release.toml b/formulas/release.toml index f702f42..ccd7f95 100644 --- a/formulas/release.toml +++ b/formulas/release.toml @@ -189,10 +189,10 @@ Restart agent containers to use the new image. - docker compose pull agents 2. Stop and remove existing agent containers: - - docker compose down agents agents-llama 2>/dev/null || true + - docker compose down agents 3. Start agents with new image: - - docker compose up -d agents agents-llama + - docker compose up -d agents 4. Wait for containers to be healthy: - for i in {1..30}; do @@ -203,7 +203,7 @@ Restart agent containers to use the new image. - done 5. Verify containers are running: - - docker compose ps agents agents-llama + - docker compose ps agents 6. Log restart: - echo "Restarted agents containers" diff --git a/formulas/run-supervisor.toml b/formulas/run-supervisor.toml index f31e6bc..4101252 100644 --- a/formulas/run-supervisor.toml +++ b/formulas/run-supervisor.toml @@ -29,7 +29,7 @@ and injected into your prompt above. Review them now. 1. Read the injected metrics data carefully (System Resources, Docker, Active Sessions, Phase Files, Stale Phase Cleanup, Lock Files, Agent Logs, - CI Pipelines, Open PRs, Issue Status, Stale Worktrees). + CI Pipelines, Open PRs, Issue Status, Stale Worktrees, **Woodpecker Agent Health**). Note: preflight.sh auto-removes PHASE:escalate files for closed issues (24h grace period). Check the "Stale Phase Cleanup" section for any files cleaned or in grace period this run. @@ -75,6 +75,10 @@ Categorize every finding from the metrics into priority levels. - Dev/action sessions in PHASE:escalate for > 24h (session timeout) (Note: PHASE:escalate files for closed issues are auto-cleaned by preflight; this check covers sessions where the issue is still open) +- **Woodpecker agent unhealthy** — see "Woodpecker Agent Health" section in preflight: + - Container not running or in unhealthy state + - gRPC errors >= 3 in last 20 minutes + - Fast-failure pipelines (duration < 60s) >= 3 in last 15 minutes ### P3 — Factory degraded - PRs stale: CI finished >20min ago AND no git push to the PR branch since CI completed @@ -100,6 +104,15 @@ For each finding from the health assessment, decide and execute an action. ### Auto-fixable (execute these directly) +**P2 Woodpecker agent unhealthy:** +The supervisor-run.sh script automatically handles WP agent recovery: +- Detects unhealthy state via preflight.sh health checks +- Restarts container via `docker restart` +- Scans for `blocked: ci_exhausted` issues updated in last 30 minutes +- Unassigns and removes blocked label from affected issues +- Posts recovery comment with infra-flake context +- Avoids duplicate restarts via 5-minute cooldown in history file + **P0 Memory crisis:** # Kill stale one-shot claude processes (>3h old) pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true @@ -248,6 +261,11 @@ Format: - (or "No actions needed") + ### WP Agent Recovery (if applicable) + - WP agent restart: