diff --git a/.env.example b/.env.example index a1f24d5..c1c0b98 100644 --- a/.env.example +++ b/.env.example @@ -32,10 +32,13 @@ FORGE_URL=http://localhost:3000 # [CONFIG] local Forgejo instance # - FORGE_PASS_DEV_QWEN2 # Name conversion: tr 'a-z-' 'A-Z_' (lowercase→UPPER, hyphens→underscores). # The compose generator looks these up via the agent's `forge_user` field in -# the project TOML. Configure local-model agents via [agents.X] sections in -# projects/*.toml — this is the canonical activation path. +# the project TOML. The pre-existing `dev-qwen` llama agent uses +# FORGE_TOKEN_LLAMA / FORGE_PASS_LLAMA (kept for backwards-compat with the +# legacy `ENABLE_LLAMA_AGENT=1` single-agent path). FORGE_TOKEN= # [SECRET] dev-bot API token (default for all agents) FORGE_PASS= # [SECRET] dev-bot password for git HTTP push (#361) +FORGE_TOKEN_LLAMA= # [SECRET] dev-qwen API token (for agents-llama) +FORGE_PASS_LLAMA= # [SECRET] dev-qwen password for git HTTP push FORGE_REVIEW_TOKEN= # [SECRET] review-bot API token FORGE_REVIEW_PASS= # [SECRET] review-bot password for git HTTP push FORGE_PLANNER_TOKEN= # [SECRET] planner-bot API token @@ -104,6 +107,13 @@ FORWARD_AUTH_SECRET= # [SECRET] Shared secret for Caddy ↔ # Store all project secrets here so formulas reference env vars, never hardcode. BASE_RPC_URL= # [SECRET] on-chain RPC endpoint +# ── Local Qwen dev agent (optional) ────────────────────────────────────── +# Set ENABLE_LLAMA_AGENT=1 to emit agents-llama in docker-compose.yml. +# Requires a running llama-server reachable at ANTHROPIC_BASE_URL. +# See docs/agents-llama.md for details. +ENABLE_LLAMA_AGENT=0 # [CONFIG] 1 = enable agents-llama service +ANTHROPIC_BASE_URL= # [CONFIG] e.g. 
http://host.docker.internal:8081 + # ── Tuning ──────────────────────────────────────────────────────────────── CLAUDE_TIMEOUT=7200 # [CONFIG] max seconds per Claude invocation diff --git a/.gitignore b/.gitignore index a29450c..21c6fbc 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ metrics/supervisor-metrics.jsonl # OS .DS_Store dev/ci-fixes-*.json +gardener/dust.jsonl # Individual encrypted secrets (managed by disinto secrets add) secrets/ diff --git a/.woodpecker/detect-duplicates.py b/.woodpecker/detect-duplicates.py index f3bf5b1..0485833 100644 --- a/.woodpecker/detect-duplicates.py +++ b/.woodpecker/detect-duplicates.py @@ -294,35 +294,6 @@ def main() -> int: "9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern", # Standard lib source block shared across formula-driven agent run scripts "330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)", - # Common vault-seed script patterns: logging helpers + flag parsing - # Used in tools/vault-seed-woodpecker.sh + lib/init/nomad/wp-oauth-register.sh - "843a1cbf987952697d4e05e96ed2b2d5": "Logging helpers + DRY_RUN init (vault-seed-woodpecker + wp-oauth-register)", - "ee51df9642f2ef37af73b0c15f4d8406": "Logging helpers + DRY_RUN loop start (vault-seed-woodpecker + wp-oauth-register)", - "9a57368f3c1dfd29ec328596b86962a0": "Flag parsing loop + case start (vault-seed-woodpecker + wp-oauth-register)", - "9d72d40ff303cbed0b7e628fc15381c3": "Case loop + dry-run handler (vault-seed-woodpecker + wp-oauth-register)", - "5b52ddbbf47948e3cbc1b383f0909588": "Help + invalid arg handler end (vault-seed-woodpecker + wp-oauth-register)", - # Common vault-seed script preamble + precondition patterns - # Shared across tools/vault-seed-{forgejo,agents,woodpecker}.sh - "dff3675c151fcdbd2fef798826ae919b": "Vault-seed preamble: set -euo + path setup + source hvault.sh + KV_MOUNT", - "1cd9f0d083e24e6e6b2071db9b6dae09": 
"Vault-seed preconditions: binary check loop + VAULT_ADDR guard", - "63bfa88d71764c95c65a9a248f3e40ab": "Vault-seed preconditions: binary check end + VAULT_ADDR die", - "34873ad3570b211ce1d90468ab6ac94c": "Vault-seed preconditions: VAULT_ADDR die + hvault_token_lookup", - "71a52270f249e843cda48ad896d9f781": "Vault-seed preconditions: VAULT_ADDR + hvault_token_lookup + die", - # Common vault-seed script flag parsing patterns - # Shared across tools/vault-seed-{forgejo,ops-repo}.sh - "6906b7787796c2ccb8dd622e2ad4e7bf": "vault-seed DRY_RUN init + case pattern (forgejo + ops-repo)", - "a0df5283b616b964f8bc32fd99ec1b5a": "vault-seed case pattern start (forgejo + ops-repo)", - "e15e3272fdd9f0f46ce9e726aea9f853": "vault-seed case pattern dry-run handler (forgejo + ops-repo)", - "c9f22385cc49a3dac1d336bc14c6315b": "vault-seed DRY_RUN assignment (forgejo + ops-repo)", - "106f4071e88f841b3208b01144cd1c39": "vault-seed case pattern dry-run end (forgejo + ops-repo)", - "c15506dcb6bb340b25d1c39d442dd2e6": "vault-seed help text + invalid arg handler (forgejo + ops-repo)", - "1feecd3b3caf00045fae938ddf2811de": "vault-seed invalid arg handler (forgejo + ops-repo)", - "919780d5e7182715344f5aa02b191294": "vault-seed invalid arg + esac pattern (forgejo + ops-repo)", - "8dce1d292bce8e60ef4c0665b62945b0": "vault-seed esac + binary check loop (forgejo + ops-repo)", - "ca043687143a5b47bd54e65a99ce8ee8": "vault-seed binary check loop start (forgejo + ops-repo)", - "aefd9f655411a955395e6e5995ddbe6f": "vault-seed binary check pattern (forgejo + ops-repo)", - "60f0c46deb5491599457efb4048918e5": "vault-seed VAULT_ADDR + hvault_token_lookup check (forgejo + ops-repo)", - "f6838f581ef6b4d82b55268389032769": "vault-seed VAULT_ADDR + hvault_token_lookup die (forgejo + ops-repo)", } if not sh_files: diff --git a/.woodpecker/nomad-validate.yml b/.woodpecker/nomad-validate.yml index 5a1cc7c..81e45ae 100644 --- a/.woodpecker/nomad-validate.yml +++ b/.woodpecker/nomad-validate.yml @@ -1,21 +1,16 @@ # 
============================================================================= # .woodpecker/nomad-validate.yml — Static validation for Nomad+Vault artifacts # -# Part of the Nomad+Vault migration (S0.5, issue #825; extended in S2.6, -# issue #884). Locks in the "no-ad-hoc-steps" principle: every HCL/shell -# artifact under nomad/, lib/init/nomad/, vault/policies/, plus the -# `disinto init` dispatcher and vault/roles.yaml, gets checked before it -# can land. +# Part of the Nomad+Vault migration (S0.5, issue #825). Locks in the +# "no-ad-hoc-steps" principle: every HCL/shell artifact under nomad/ or +# lib/init/nomad/, plus the `disinto init` dispatcher, gets checked +# before it can land. # # Triggers on PRs (and pushes) that touch any of: # nomad/** — HCL configs (server, client, vault) -# lib/init/nomad/** — cluster-up / install / systemd / vault-init / -# vault-nomad-auth (S2.6 trigger: vault-*.sh -# is a subset of this glob) +# lib/init/nomad/** — cluster-up / install / systemd / vault-init # bin/disinto — `disinto init --backend=nomad` dispatcher # tests/disinto-init-nomad.bats — the bats suite itself -# vault/policies/** — Vault ACL policy HCL files (S2.1, S2.6) -# vault/roles.yaml — JWT-auth role bindings (S2.3, S2.6) # .woodpecker/nomad-validate.yml — the pipeline definition # # Steps (all fail-closed — any error blocks merge): @@ -24,22 +19,8 @@ # nomad/jobs/*.hcl (new jobspecs get # CI coverage automatically) # 3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl -# 4. vault-policy-fmt — `vault policy fmt` idempotence check on -# every vault/policies/*.hcl (format drift = -# CI fail; non-destructive via cp+diff) -# 5. vault-policy-validate — HCL syntax + capability validation for every -# vault/policies/*.hcl via `vault policy write` -# against an inline dev-mode Vault server -# 6. 
vault-roles-validate — yamllint + role→policy reference check on -# vault/roles.yaml (every referenced policy -# must exist as vault/policies/<policy>.hcl) -# 7. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto -# 8. bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests -# -# Secret-scan coverage: vault/policies/*.hcl is already scanned by the -# P11 gate (.woodpecker/secret-scan.yml, issue #798) — its trigger path -# `vault/**/*` covers everything under this directory. We intentionally -# do NOT duplicate that gate here; one scanner, one source of truth. +# 4. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto +# 5. bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests # # Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 / # vault 1.18.5). Bump there AND here together — drift = CI passing on @@ -53,8 +34,6 @@ when: - "lib/init/nomad/**" - "bin/disinto" - "tests/disinto-init-nomad.bats" - - "vault/policies/**" - - "vault/roles.yaml" - ".woodpecker/nomad-validate.yml" # Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is @@ -144,176 +123,7 @@ steps: *) echo "vault config: hard failure (rc=$rc)" >&2; exit "$rc" ;; esac - # ── 4. Vault policy fmt idempotence check ──────────────────────────────── - # `vault policy fmt <file>` formats a local HCL policy file in place. - # There's no `-check`/dry-run flag (vault 1.18.5), so we implement a - # non-destructive check as cp → fmt-on-copy → diff against original. - # Any diff means the committed file would be rewritten by `vault policy - # fmt` — failure steers the author to run `vault policy fmt <file>` - # locally before pushing. - # - # Scope: vault/policies/*.hcl only. The `[ -f "$f" ]` guard handles the - # no-match case (POSIX sh does not nullglob) so an empty policies/ - # directory does not fail this step. 
- # - # Note: `vault policy fmt` is purely local (HCL text transform) and does - # not require a running Vault server, which is why this step can run - # without starting one. - - name: vault-policy-fmt - image: hashicorp/vault:1.18.5 - commands: - - | - set -e - failed=0 - for f in vault/policies/*.hcl; do - [ -f "$f" ] || continue - tmp="/tmp/$(basename "$f").fmt" - cp "$f" "$tmp" - vault policy fmt "$tmp" >/dev/null 2>&1 - if ! diff -u "$f" "$tmp"; then - echo "ERROR: $f is not formatted — run 'vault policy fmt $f' locally" >&2 - failed=1 - fi - done - if [ "$failed" -gt 0 ]; then - echo "vault-policy-fmt: formatting drift detected" >&2 - exit 1 - fi - echo "vault-policy-fmt: all policies formatted correctly" - - # ── 5. Vault policy HCL syntax + capability validation ─────────────────── - # Vault has no offline `vault policy validate` subcommand — the closest - # in-CLI validator is `vault policy write`, which sends the HCL to a - # running server which parses it, checks capability names against the - # known set (read, list, create, update, delete, patch, sudo, deny), - # and rejects unknown stanzas / malformed path blocks. We start an - # inline dev-mode Vault (in-memory, no persistence, root token = "root") - # for the duration of this step and loop `vault policy write` over every - # vault/policies/*.hcl; the policies never leave the ephemeral dev - # server, so this is strictly a validator — not a deploy. - # - # Exit-code handling: - # - `vault policy write` exits 0 on success, non-zero on any parse / - # semantic error. We aggregate failures across all files so a single - # CI run surfaces every broken policy (not just the first). - # - The dev server is killed on any step exit via EXIT trap so the - # step tears down cleanly even on failure. - # - # Why dev-mode is sufficient: we're not persisting secrets, only asking - # Vault to parse policy text. The factory's production Vault is NOT - # contacted. 
- - name: vault-policy-validate - image: hashicorp/vault:1.18.5 - commands: - - | - set -e - vault server -dev -dev-root-token-id=root -dev-listen-address=127.0.0.1:8200 >/tmp/vault-dev.log 2>&1 & - VAULT_PID=$! - trap 'kill "$VAULT_PID" 2>/dev/null || true' EXIT INT TERM - export VAULT_ADDR=http://127.0.0.1:8200 - export VAULT_TOKEN=root - ready=0 - i=0 - while [ "$i" -lt 30 ]; do - if vault status >/dev/null 2>&1; then - ready=1 - break - fi - i=$((i + 1)) - sleep 0.5 - done - if [ "$ready" -ne 1 ]; then - echo "vault-policy-validate: dev server failed to start after 15s" >&2 - cat /tmp/vault-dev.log >&2 || true - exit 1 - fi - failed=0 - for f in vault/policies/*.hcl; do - [ -f "$f" ] || continue - name=$(basename "$f" .hcl) - echo "validate: $f" - if ! vault policy write "$name" "$f"; then - echo " ERROR: $f failed validation" >&2 - failed=1 - fi - done - if [ "$failed" -gt 0 ]; then - echo "vault-policy-validate: validation errors found" >&2 - exit 1 - fi - echo "vault-policy-validate: all policies valid" - - # ── 6. vault/roles.yaml validator ──────────────────────────────────────── - # Validates the JWT-auth role bindings file (S2.3). Two checks: - # - # a. `yamllint` — catches YAML syntax errors and indentation drift. - # Uses a relaxed config (line length bumped to 200) because - # roles.yaml's comments are wide by design. - # b. role → policy reference check — every role's `policy:` field - # must match a basename in vault/policies/*.hcl. A role pointing - # at a non-existent policy = runtime "permission denied" at job - # placement; catching the drift here turns it into a CI failure. - # Also verifies each role entry has the four required fields - # (name, policy, namespace, job_id) per the file's documented - # format. - # - # Parsing is done with PyYAML (the roles.yaml format is a strict - # subset that awk-level parsing in tools/vault-apply-roles.sh handles - # too, but PyYAML in CI gives us structural validation for free). 
If - # roles.yaml is ever absent (e.g. reverted), the step skips rather - # than fails — presence is enforced by S2.3's own tooling, not here. - - name: vault-roles-validate - image: python:3.12-alpine - commands: - - pip install --quiet --disable-pip-version-check pyyaml yamllint - - | - set -e - if [ ! -f vault/roles.yaml ]; then - echo "vault-roles-validate: vault/roles.yaml not present, skipping" - exit 0 - fi - yamllint -d '{extends: relaxed, rules: {line-length: {max: 200}}}' vault/roles.yaml - echo "vault-roles-validate: yamllint OK" - python3 - <<'PY' - import os - import sys - import yaml - - with open('vault/roles.yaml') as f: - data = yaml.safe_load(f) or {} - roles = data.get('roles') or [] - if not roles: - print("vault-roles-validate: no roles defined in vault/roles.yaml", file=sys.stderr) - sys.exit(1) - existing = { - os.path.splitext(e)[0] - for e in os.listdir('vault/policies') - if e.endswith('.hcl') - } - required = ('name', 'policy', 'namespace', 'job_id') - failed = 0 - for r in roles: - if not isinstance(r, dict): - print(f"ERROR: role entry is not a mapping: {r!r}", file=sys.stderr) - failed = 1 - continue - for field in required: - if r.get(field) in (None, ''): - print(f"ERROR: role entry missing required field '{field}': {r}", file=sys.stderr) - failed = 1 - policy = r.get('policy') - if policy and policy not in existing: - print( - f"ERROR: role '{r.get('name')}' references policy '{policy}' " - f"but vault/policies/{policy}.hcl does not exist", - file=sys.stderr, - ) - failed = 1 - sys.exit(failed) - PY - echo "vault-roles-validate: all role→policy references valid" - - # ── 7. Shellcheck ──────────────────────────────────────────────────────── + # ── 4. Shellcheck ──────────────────────────────────────────────────────── # Covers the new lib/init/nomad/*.sh scripts plus bin/disinto (which owns # the backend dispatcher). 
bin/disinto has no .sh extension so the # repo-wide shellcheck in .woodpecker/ci.yml skips it — this step is the @@ -323,7 +133,7 @@ steps: commands: - shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto - # ── 8. bats: `disinto init --backend=nomad --dry-run` ──────────────────── + # ── 5. bats: `disinto init --backend=nomad --dry-run` ──────────────────── # Smoke-tests the CLI dispatcher: both --backend=nomad variants exit 0 # with the expected step list, and --backend=docker stays on the docker # path (regression guard). Pure dry-run — no sudo, no network. diff --git a/AGENTS.md b/AGENTS.md index 97634a4..eec058c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,4 +1,4 @@ - + # Disinto — Agent Instructions ## What this repo is @@ -37,20 +37,17 @@ disinto/ (code repo) │ examples/ — example vault action TOMLs (promote, publish, release, webhook-call) ├── lib/ env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, stack-lock.sh, forge-setup.sh, forge-push.sh, ops-setup.sh, ci-setup.sh, generators.sh, hire-agent.sh, release.sh, build-graph.py, branch-protection.sh, secret-scan.sh, tea-helpers.sh, action-vault.sh, ci-log-reader.py, git-creds.sh, sprint-filer.sh, hvault.sh │ hooks/ — Claude Code session hooks (on-compact-reinject, on-idle-stop, on-phase-change, on-pretooluse-guard, on-session-end, on-stop-failure) -│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825); wp-oauth-register.sh (Forgejo OAuth2 app + Vault KV seeder for Woodpecker, S3.3); deploy.sh (dependency-ordered Nomad job deploy + health-wait, S4) -├── nomad/ server.hcl, client.hcl (allow_privileged for woodpecker-agent, S3-fix-5), vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh -│ jobs/ — Nomad jobspecs: forgejo.hcl (Vault secrets via template, S2.4); 
woodpecker-server.hcl + woodpecker-agent.hcl (host-net, docker.sock, Vault KV, S3.1-S3.2); agents.hcl (7 roles, llama, Vault-templated bot tokens, S4.1); vault-runner.hcl (parameterized batch dispatch, S5.3); staging.hcl (Caddy file-server, dynamic port — edge discovers via service registration, S5.2); chat.hcl (Claude chat UI, tmpfs via mount block, Vault OAuth secrets, S5.2); edge.hcl (Caddy proxy + dispatcher sidecar, S5.1) +│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825) +├── nomad/ server.hcl, client.hcl, vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh ├── projects/ *.toml.example — templates; *.toml — local per-box config (gitignored) ├── formulas/ Issue templates (TOML specs for multi-step agent tasks) ├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge dispatcher, chat (server.py, entrypoint-chat.sh, Dockerfile, ui/) ├── tools/ Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh) -│ vault-apply-policies.sh, vault-apply-roles.sh, vault-import.sh — Vault provisioning (S2.1/S2.2) -│ vault-seed-.sh — per-service Vault secret seeders; auto-invoked by `bin/disinto --with ` (add a new file to support a new service) ├── docs/ Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md) ├── site/ disinto.ai website content -├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, lib-generators.bats, vault-import.bats, disinto-init-nomad.bats) +├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, disinto-init-nomad.bats) ├── templates/ Issue templates -├── bin/ The `disinto` CLI script (`--with ` deploys services + runs their Vault seeders) +├── bin/ The `disinto` CLI script ├── disinto-factory/ Setup documentation and skill ├── state/ Runtime state ├── .woodpecker/ Woodpecker CI pipeline configs @@ -123,7 +120,8 @@ bash dev/phase-test.sh | Reproduce | `docker/reproduce/` | 
Bug reproduction using Playwright MCP | `formulas/reproduce.toml` | | Triage | `docker/reproduce/` | Deep root cause analysis | `formulas/triage.toml` | | Edge dispatcher | `docker/edge/` | Polls ops repo for vault actions, executes via Claude sessions | `docker/edge/dispatcher.sh` | -| Local-model agents | `docker/agents/` (same image) | Local llama-server agents configured via `[agents.X]` sections in project TOML | [docs/agents-llama.md](docs/agents-llama.md) | +| agents-llama | `docker/agents/` (same image) | Local-Qwen dev agent (`AGENT_ROLES=dev`), gated on `ENABLE_LLAMA_AGENT=1` | [docs/agents-llama.md](docs/agents-llama.md) | +| agents-llama-all | `docker/agents/` (same image) | Local-Qwen all-roles agent (all 7 roles), profile `agents-llama-all` | [docs/agents-llama.md](docs/agents-llama.md) | > **Vault:** Being redesigned as a PR-based approval workflow (issues #73-#77). > See [docs/VAULT.md](docs/VAULT.md) for the vault PR workflow details. @@ -194,7 +192,9 @@ Humans write these. Agents read and enforce them. ## Phase-Signaling Protocol -When running as a persistent tmux session, Claude must signal the orchestrator at each phase boundary by writing to a phase file (e.g. `/tmp/dev-session-{project}-{issue}.phase`). +When running as a persistent tmux session, Claude must signal the orchestrator +at each phase boundary by writing to a phase file (e.g. +`/tmp/dev-session-{project}-{issue}.phase`). Key phases: `PHASE:awaiting_ci` → `PHASE:awaiting_review` → `PHASE:done`. Also: `PHASE:escalate` (needs human input), `PHASE:failed`. See [docs/PHASE-PROTOCOL.md](docs/PHASE-PROTOCOL.md) for the complete spec, orchestrator reaction matrix, sequence diagram, and crash recovery. 
diff --git a/architect/AGENTS.md b/architect/AGENTS.md index 61987ae..9582b03 100644 --- a/architect/AGENTS.md +++ b/architect/AGENTS.md @@ -1,4 +1,4 @@ - + # Architect — Agent Instructions ## What this agent is diff --git a/bin/disinto b/bin/disinto index 7f6379d..b86249f 100755 --- a/bin/disinto +++ b/bin/disinto @@ -82,16 +82,16 @@ Init options: --ci-id Woodpecker CI repo ID (default: 0 = no CI) --forge-url Forge base URL (default: http://localhost:3000) --backend Orchestration backend: docker (default) | nomad - --with (nomad) Deploy services: forgejo,woodpecker,agents,staging,chat,edge[,...] (S1.3, S3.4, S4.2, S5.2, S5.5) + --with (nomad) Deploy services: forgejo[,...] (S1.3) --empty (nomad) Bring up cluster only, no jobs (S0.4) --bare Skip compose generation (bare-metal setup) --build Use local docker build instead of registry images (dev mode) --yes Skip confirmation prompts --rotate-tokens Force regeneration of all bot tokens/passwords (idempotent by default) --dry-run Print every intended action without executing - --import-env (nomad) Path to .env file for import into Vault KV (S2.5) - --import-sops (nomad) Path to sops-encrypted .env.vault.enc for import (S2.5) - --age-key (nomad) Path to age keyfile (required with --import-sops) (S2.5) + --import-env (nomad) Path to .env file for import into Vault KV + --import-sops (nomad) Path to sops-encrypted .env.vault.enc for import + --age-key (nomad) Path to age keyfile (required with --import-sops) Hire an agent options: --formula Path to role formula TOML (default: formulas/.toml) @@ -670,10 +670,9 @@ _disinto_init_nomad() { local import_env="${4:-}" import_sops="${5:-}" age_key="${6:-}" local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh" local deploy_sh="${FACTORY_ROOT}/lib/init/nomad/deploy.sh" - local vault_engines_sh="${FACTORY_ROOT}/lib/init/nomad/vault-engines.sh" - local vault_policies_sh="${FACTORY_ROOT}/tools/vault-apply-policies.sh" - local 
vault_auth_sh="${FACTORY_ROOT}/lib/init/nomad/vault-nomad-auth.sh" local vault_import_sh="${FACTORY_ROOT}/tools/vault-import.sh" + local vault_auth_sh="${FACTORY_ROOT}/lib/init/nomad/vault-nomad-auth.sh" + local vault_policies_sh="${FACTORY_ROOT}/tools/vault-apply-policies.sh" if [ ! -x "$cluster_up" ]; then echo "Error: ${cluster_up} not found or not executable" >&2 @@ -685,42 +684,6 @@ _disinto_init_nomad() { exit 1 fi - # --empty short-circuits after cluster-up: no policies, no auth, no - # import, no deploy. It's the "cluster-only escape hatch" for debugging - # (docs/nomad-migration.md). Caller-side validation already rejects - # --empty combined with --with or any --import-* flag, so reaching - # this branch with those set is a bug in the caller. - # - # On the default (non-empty) path, vault-engines.sh (enables the kv/ - # mount), vault-apply-policies.sh, and vault-nomad-auth.sh are invoked - # unconditionally — they are idempotent and cheap to re-run, and - # subsequent --with deployments depend on them. vault-import.sh is - # invoked only when an --import-* flag is set. vault-engines.sh runs - # first because every policy and role below references kv/disinto/* - # paths, which 403 if the engine is not yet mounted (issue #912). - local import_any=false - if [ -n "$import_env" ] || [ -n "$import_sops" ]; then - import_any=true - fi - if [ "$empty" != "true" ]; then - if [ ! -x "$vault_engines_sh" ]; then - echo "Error: ${vault_engines_sh} not found or not executable" >&2 - exit 1 - fi - if [ ! -x "$vault_policies_sh" ]; then - echo "Error: ${vault_policies_sh} not found or not executable" >&2 - exit 1 - fi - if [ ! -x "$vault_auth_sh" ]; then - echo "Error: ${vault_auth_sh} not found or not executable" >&2 - exit 1 - fi - if [ "$import_any" = true ] && [ ! -x "$vault_import_sh" ]; then - echo "Error: ${vault_import_sh} not found or not executable" >&2 - exit 1 - fi - fi - # --empty and default both invoke cluster-up today. 
Log the requested # mode so the dispatch is visible in factory bootstrap logs — Step 1 # will branch on $empty to gate the job-deployment path. @@ -730,7 +693,7 @@ _disinto_init_nomad() { echo "nomad backend: default (cluster-up; jobs deferred to Step 1)" fi - # Dry-run: print cluster-up plan + policies/auth/import plan + deploy.sh plan + # Dry-run: print cluster-up plan + import plan + deploy.sh plan if [ "$dry_run" = "true" ]; then echo "" echo "── Cluster-up dry-run ─────────────────────────────────" @@ -738,82 +701,46 @@ _disinto_init_nomad() { "${cmd[@]}" || true echo "" - # --empty skips policies/auth/import/deploy — cluster-up only, no - # workloads. The operator-visible dry-run plan must match the real - # run, so short-circuit here too. - if [ "$empty" = "true" ]; then - exit 0 - fi - - # Vault engines + policies + auth are invoked on every nomad real-run - # path regardless of --import-* flags (they're idempotent; S2.1 + S2.3). - # Engines runs first because policies/roles/templates all reference the - # kv/ mount it enables (issue #912). Mirror that ordering in the - # dry-run plan so the operator sees the full sequence Step 2 will - # execute. - echo "── Vault engines dry-run ──────────────────────────────" - echo "[engines] [dry-run] ${vault_engines_sh} --dry-run" - echo "" - echo "── Vault policies dry-run ─────────────────────────────" - echo "[policies] [dry-run] ${vault_policies_sh} --dry-run" - echo "" - echo "── Vault auth dry-run ─────────────────────────────────" - echo "[auth] [dry-run] ${vault_auth_sh}" - echo "" - - # Import plan: one line per --import-* flag that is actually set. - # Printing independently (not in an if/elif chain) means that all - # three flags appearing together each echo their own path — the - # regression that bit prior implementations of this issue (#883). 
- if [ "$import_any" = true ]; then + # Import plan if any import flags are set + if [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; then echo "── Vault import dry-run ───────────────────────────────" - [ -n "$import_env" ] && echo "[import] --import-env env file: ${import_env}" - [ -n "$import_sops" ] && echo "[import] --import-sops sops file: ${import_sops}" - [ -n "$age_key" ] && echo "[import] --age-key age key: ${age_key}" - local -a import_dry_cmd=("$vault_import_sh") - [ -n "$import_env" ] && import_dry_cmd+=("--env" "$import_env") - [ -n "$import_sops" ] && import_dry_cmd+=("--sops" "$import_sops") - [ -n "$age_key" ] && import_dry_cmd+=("--age-key" "$age_key") - import_dry_cmd+=("--dry-run") - echo "[import] [dry-run] ${import_dry_cmd[*]}" + if [ -n "$import_env" ]; then + echo "[import] env file: ${import_env}" + fi + if [ -n "$import_sops" ]; then + echo "[import] sops file: ${import_sops}" + fi + if [ -n "$age_key" ]; then + echo "[import] age key: ${age_key}" + fi + echo "[import] [dry-run] ${vault_import_sh} --dry-run" + echo "[import] [dry-run] vault import plan printed above" + echo "" + echo "── Vault policies dry-run ─────────────────────────────" + echo "[policies] [dry-run] ${vault_policies_sh} --dry-run" + echo "" + echo "── Vault auth dry-run ─────────────────────────────────" + echo "[auth] [dry-run] ${vault_auth_sh}" echo "" else - echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" + echo "[import] no --import-env/--import-sops - skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" echo "" fi if [ -n "$with_services" ]; then - # Interleaved seed/deploy per service (S2.6, #928, #948): match the - # real-run path so dry-run output accurately represents execution order. 
- # Build ordered deploy list: only include services present in with_services - local DEPLOY_ORDER="" - for ordered_svc in forgejo woodpecker-server woodpecker-agent agents staging chat edge; do - if echo ",$with_services," | grep -q ",$ordered_svc,"; then - DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}" - fi - done - - local IFS=' ' - echo "[deploy] deployment order: ${DEPLOY_ORDER}" - for svc in $DEPLOY_ORDER; do - # Seed this service (if seed script exists) - local seed_name="$svc" + echo "── Deploy services dry-run ────────────────────────────" + echo "[deploy] services to deploy: ${with_services}" + local IFS=',' + for svc in $with_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + # Validate known services first case "$svc" in - woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; - agents) seed_name="agents" ;; - chat) seed_name="chat" ;; - edge) seed_name="ops-repo" ;; + forgejo) ;; + *) + echo "Error: unknown service '${svc}' — known: forgejo" >&2 + exit 1 + ;; esac - local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" - if [ -x "$seed_script" ]; then - echo "── Vault seed dry-run ─────────────────────────────────" - echo "[seed] [dry-run] ${seed_script} --dry-run" - echo "" - fi - - # Deploy this service - echo "── Deploy services dry-run ────────────────────────────" - echo "[deploy] services to deploy: ${with_services}" local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" if [ ! 
-f "$jobspec_path" ]; then echo "Error: jobspec not found: ${jobspec_path}" >&2 @@ -824,36 +751,10 @@ _disinto_init_nomad() { done echo "[deploy] dry-run complete" fi - - # Dry-run vault-runner (unconditionally, not gated by --with) - echo "" - echo "── Vault-runner dry-run ───────────────────────────────────" - local vault_runner_path="${FACTORY_ROOT}/nomad/jobs/vault-runner.hcl" - if [ -f "$vault_runner_path" ]; then - echo "[deploy] vault-runner: [dry-run] nomad job validate ${vault_runner_path}" - echo "[deploy] vault-runner: [dry-run] nomad job run -detach ${vault_runner_path}" - else - echo "[deploy] vault-runner: jobspec not found, skipping" - fi - - # Build custom images dry-run (if agents, chat, or edge services are included) - if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then - echo "" - echo "── Build images dry-run ──────────────────────────────" - if echo ",$with_services," | grep -q ",agents,"; then - echo "[build] [dry-run] docker build -t disinto/agents:local -f ${FACTORY_ROOT}/docker/agents/Dockerfile ${FACTORY_ROOT}" - fi - if echo ",$with_services," | grep -q ",chat,"; then - echo "[build] [dry-run] docker build -t disinto/chat:local -f ${FACTORY_ROOT}/docker/chat/Dockerfile ${FACTORY_ROOT}/docker/chat" - fi - if echo ",$with_services," | grep -q ",edge,"; then - echo "[build] [dry-run] docker build -t disinto/edge:local -f ${FACTORY_ROOT}/docker/edge/Dockerfile ${FACTORY_ROOT}/docker/edge" - fi - fi exit 0 fi - # Real run: cluster-up + policies + auth + (optional) import + deploy + # Real run: cluster-up + import + deploy services local -a cluster_cmd=("$cluster_up") if [ "$(id -u)" -eq 0 ]; then "${cluster_cmd[@]}" || exit $? @@ -865,64 +766,48 @@ _disinto_init_nomad() { sudo -n -- "${cluster_cmd[@]}" || exit $? fi - # --empty short-circuits here: cluster-up only, no policies/auth/import - # and no deploy. Matches the dry-run plan above and the docs/runbook. 
- if [ "$empty" = "true" ]; then - exit 0 - fi - - # Enable Vault secret engines (S2.1 / issue #912) — must precede - # policies/auth/import because every policy and every import target - # addresses paths under kv/. Idempotent, safe to re-run. + # Apply Vault policies (S2.1) echo "" - echo "── Enabling Vault secret engines ──────────────────────" - local -a engines_cmd=("$vault_engines_sh") + echo "── Applying Vault policies ─────────────────────────────" if [ "$(id -u)" -eq 0 ]; then - "${engines_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: vault-engines.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- "${engines_cmd[@]}" || exit $? - fi - - # Apply Vault policies (S2.1) — idempotent, safe to re-run. - echo "" - echo "── Applying Vault policies ────────────────────────────" - local -a policies_cmd=("$vault_policies_sh") - if [ "$(id -u)" -eq 0 ]; then - "${policies_cmd[@]}" || exit $? + "${vault_policies_sh}" || exit $? else if ! command -v sudo >/dev/null 2>&1; then echo "Error: vault-apply-policies.sh must run as root and sudo is not installed" >&2 exit 1 fi - sudo -n -- "${policies_cmd[@]}" || exit $? + sudo -n -- "${vault_policies_sh}" || exit $? fi - # Configure Vault JWT auth + Nomad workload identity (S2.3) — idempotent. + # Configure Vault JWT auth (S2.3) echo "" - echo "── Configuring Vault JWT auth ─────────────────────────" - local -a auth_cmd=("$vault_auth_sh") + echo "── Configuring Vault JWT auth ──────────────────────────" if [ "$(id -u)" -eq 0 ]; then - "${auth_cmd[@]}" || exit $? + "${vault_auth_sh}" || exit $? else if ! command -v sudo >/dev/null 2>&1; then echo "Error: vault-nomad-auth.sh must run as root and sudo is not installed" >&2 exit 1 fi - sudo -n -- "${auth_cmd[@]}" || exit $? + sudo -n -- "${vault_auth_sh}" || exit $? fi - # Import secrets if any --import-* flag is set (S2.2). 
- if [ "$import_any" = true ]; then + # Import secrets if import flags are set (S2.2) + if [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; then echo "" - echo "── Importing secrets into Vault ───────────────────────" + echo "── Importing secrets into Vault ────────────────────────" local -a import_cmd=("$vault_import_sh") - [ -n "$import_env" ] && import_cmd+=("--env" "$import_env") - [ -n "$import_sops" ] && import_cmd+=("--sops" "$import_sops") - [ -n "$age_key" ] && import_cmd+=("--age-key" "$age_key") + + if [ -n "$import_env" ]; then + import_cmd+=("--env" "$import_env") + fi + if [ -n "$import_sops" ]; then + import_cmd+=("--sops" "$import_sops") + fi + if [ -n "$age_key" ]; then + import_cmd+=("--age-key" "$age_key") + fi + if [ "$(id -u)" -eq 0 ]; then "${import_cmd[@]}" || exit $? else @@ -933,166 +818,62 @@ _disinto_init_nomad() { sudo -n -- "${import_cmd[@]}" || exit $? fi else - echo "" - echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" + echo "[import] no --import-env/--import-sops - skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" fi - # Build custom images required by Nomad jobs (S4.2, S5.2, S5.5) — before deploy. - # Single-node factory dev box: no multi-node pull needed, no registry auth. - # Can upgrade to approach B (registry push/pull) later if multi-node. 
- if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then - echo "" - echo "── Building custom images ─────────────────────────────" - if echo ",$with_services," | grep -q ",agents,"; then - local tag="disinto/agents:local" - echo "── Building $tag ─────────────────────────────" - docker build -t "$tag" -f "${FACTORY_ROOT}/docker/agents/Dockerfile" "${FACTORY_ROOT}" 2>&1 | tail -5 - fi - if echo ",$with_services," | grep -q ",chat,"; then - local tag="disinto/chat:local" - echo "── Building $tag ─────────────────────────────" - docker build -t "$tag" -f "${FACTORY_ROOT}/docker/chat/Dockerfile" "${FACTORY_ROOT}/docker/chat" 2>&1 | tail -5 - fi - if echo ",$with_services," | grep -q ",edge,"; then - local tag="disinto/edge:local" - echo "── Building $tag ─────────────────────────────" - docker build -t "$tag" -f "${FACTORY_ROOT}/docker/edge/Dockerfile" "${FACTORY_ROOT}/docker/edge" 2>&1 | tail -5 - fi - fi - - # Interleaved seed/deploy per service (S2.6, #928, #948). - # We interleave seed + deploy per service (not batch all seeds then all deploys) - # so that OAuth-dependent services can reach their dependencies during seeding. - # E.g., seed-forgejo → deploy-forgejo → seed-woodpecker (OAuth can now reach - # running forgejo) → deploy-woodpecker. 
+ # Deploy services if requested if [ -n "$with_services" ]; then - local vault_addr="${VAULT_ADDR:-http://127.0.0.1:8200}" - - # Build ordered deploy list (S3.4, S4.2, S5.2, S5.5): forgejo → woodpecker-server → woodpecker-agent → agents → staging → chat → edge - local DEPLOY_ORDER="" - for ordered_svc in forgejo woodpecker-server woodpecker-agent agents staging chat edge; do - if echo ",$with_services," | grep -q ",$ordered_svc,"; then - DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}" + echo "" + echo "── Deploying services ─────────────────────────────────" + local -a deploy_cmd=("$deploy_sh") + # Split comma-separated service list into positional args + local IFS=',' + for svc in $with_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + if ! echo "$svc" | grep -qE '^[a-zA-Z0-9_-]+$'; then + echo "Error: invalid service name '${svc}' — must match ^[a-zA-Z0-9_-]+$" >&2 + exit 1 fi - done - - local IFS=' ' - for svc in $DEPLOY_ORDER; do - # Seed this service (if seed script exists) - local seed_name="$svc" + # Validate known services FIRST (before jobspec check) case "$svc" in - woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; - agents) seed_name="agents" ;; - chat) seed_name="chat" ;; - edge) seed_name="ops-repo" ;; - esac - local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" - if [ -x "$seed_script" ]; then - echo "" - echo "── Seeding Vault for ${seed_name} ───────────────────────────" - if [ "$(id -u)" -eq 0 ]; then - VAULT_ADDR="$vault_addr" "$seed_script" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: vault-seed-${seed_name}.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- env "VAULT_ADDR=$vault_addr" "$seed_script" || exit $? 
- fi - fi - - # Deploy this service - echo "" - echo "── Deploying ${svc} ───────────────────────────────────────" - - # Seed host volumes before deployment (if needed) - case "$svc" in - staging) - # Seed site-content host volume (/srv/disinto/docker) with static content - # The staging jobspec mounts this volume read-only to /srv/site - local site_content_src="${FACTORY_ROOT}/docker/index.html" - local site_content_dst="/srv/disinto/docker" - if [ -f "$site_content_src" ] && [ -d "$site_content_dst" ]; then - if ! cmp -s "$site_content_src" "${site_content_dst}/index.html" 2>/dev/null; then - echo "[staging] seeding site-content volume..." - cp "$site_content_src" "${site_content_dst}/index.html" - fi - fi + forgejo) ;; + *) + echo "Error: unknown service '${svc}' — known: forgejo" >&2 + exit 1 ;; esac - + # Check jobspec exists local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" if [ ! -f "$jobspec_path" ]; then echo "Error: jobspec not found: ${jobspec_path}" >&2 exit 1 fi - - local -a deploy_cmd=("$deploy_sh" "$svc") - if [ "$(id -u)" -eq 0 ]; then - "${deploy_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: deploy.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- "${deploy_cmd[@]}" || exit $? - fi + deploy_cmd+=("$svc") done - # Run vault-runner (unconditionally, not gated by --with) — infrastructure job - # vault-runner is always present since it's needed for vault action dispatch - echo "" - echo "── Running vault-runner ────────────────────────────────────" - local vault_runner_path="${FACTORY_ROOT}/nomad/jobs/vault-runner.hcl" - if [ -f "$vault_runner_path" ]; then - echo "[deploy] vault-runner: running Nomad job (infrastructure)" - local -a vault_runner_cmd=("$deploy_sh" "vault-runner") - if [ "$(id -u)" -eq 0 ]; then - "${vault_runner_cmd[@]}" || exit $? - else - if ! 
command -v sudo >/dev/null 2>&1; then - echo "Error: deploy.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- "${vault_runner_cmd[@]}" || exit $? - fi + if [ "$(id -u)" -eq 0 ]; then + "${deploy_cmd[@]}" || exit $? else - echo "[deploy] vault-runner: jobspec not found, skipping" + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: deploy.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${deploy_cmd[@]}" || exit $? fi # Print final summary echo "" echo "── Summary ────────────────────────────────────────────" echo "Cluster: Nomad+Vault cluster is up" - echo "Policies: applied (Vault ACL)" - echo "Auth: Vault JWT auth + Nomad workload identity configured" - if [ "$import_any" = true ]; then - local import_desc="" - [ -n "$import_env" ] && import_desc+="${import_env} " - [ -n "$import_sops" ] && import_desc+="${import_sops} " - echo "Imported: ${import_desc% }" + if [ -n "$import_env" ] || [ -n "$import_sops" ]; then + echo "Imported: secrets from ${import_env:+$import_env }${import_sops:+${import_sops} }" else - echo "Imported: (none — seed kv/disinto/* manually before deploying secret-dependent services)" + echo "Imported: (none — secrets must be seeded manually)" fi echo "Deployed: ${with_services}" - if echo ",$with_services," | grep -q ",forgejo,"; then + if echo "$with_services" | grep -q "forgejo"; then echo "Ports: forgejo: 3000" fi - if echo ",$with_services," | grep -q ",woodpecker-server,"; then - echo " woodpecker-server: 8000" - fi - if echo ",$with_services," | grep -q ",woodpecker-agent,"; then - echo " woodpecker-agent: (agent connected)" - fi - if echo ",$with_services," | grep -q ",agents,"; then - echo " agents: (polling loop running)" - fi - if echo ",$with_services," | grep -q ",staging,"; then - echo " staging: (internal, no external port)" - fi - if echo ",$with_services," | grep -q ",chat,"; then - echo " chat: 8080" - fi echo 
"────────────────────────────────────────────────────────" fi @@ -1114,8 +895,7 @@ disinto_init() { fi # Parse flags - local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false with_services="" - local import_env="" import_sops="" age_key="" + local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false with_services="" import_env="" import_sops="" age_key="" while [ $# -gt 0 ]; do case "$1" in --branch) branch="$2"; shift 2 ;; @@ -1133,11 +913,8 @@ disinto_init() { --rotate-tokens) rotate_tokens=true; shift ;; --dry-run) dry_run=true; shift ;; --import-env) import_env="$2"; shift 2 ;; - --import-env=*) import_env="${1#--import-env=}"; shift ;; --import-sops) import_sops="$2"; shift 2 ;; - --import-sops=*) import_sops="${1#--import-sops=}"; shift ;; --age-key) age_key="$2"; shift 2 ;; - --age-key=*) age_key="${1#--age-key=}"; shift ;; *) echo "Unknown option: $1" >&2; exit 1 ;; esac done @@ -1178,104 +955,32 @@ disinto_init() { exit 1 fi - # Normalize --with services (S3.4): expand 'woodpecker' shorthand to - # 'woodpecker-server,woodpecker-agent', auto-include forgejo when - # woodpecker is requested (OAuth dependency), and validate all names. - if [ -n "$with_services" ]; then - # Expand 'woodpecker' (bare) → 'woodpecker-server,woodpecker-agent'. - # Must not match already-expanded 'woodpecker-server'/'woodpecker-agent'. - local expanded="" - local IFS=',' - for _svc in $with_services; do - _svc=$(echo "$_svc" | xargs) - case "$_svc" in - woodpecker) _svc="woodpecker-server,woodpecker-agent" ;; - agents) _svc="agents" ;; - esac - expanded="${expanded:+${expanded},}${_svc}" - done - with_services="$expanded" - unset IFS - - # Auto-include forgejo when woodpecker is requested - if echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent," \ - && ! 
echo ",$with_services," | grep -q ",forgejo,"; then - echo "Note: --with woodpecker implies --with forgejo (OAuth dependency)" - with_services="forgejo,${with_services}" - fi - - # Auto-include forgejo and woodpecker when agents is requested - if echo ",$with_services," | grep -q ",agents,"; then - if ! echo ",$with_services," | grep -q ",forgejo,"; then - echo "Note: --with agents implies --with forgejo (agents need forge)" - with_services="forgejo,${with_services}" - fi - if ! echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent,"; then - echo "Note: --with agents implies --with woodpecker (agents need CI)" - with_services="${with_services},woodpecker-server,woodpecker-agent" - fi - fi - - # Auto-include all dependencies when edge is requested (S5.5) - if echo ",$with_services," | grep -q ",edge,"; then - # Edge depends on all backend services - for dep in forgejo woodpecker-server woodpecker-agent agents staging chat; do - if ! echo ",$with_services," | grep -q ",${dep},"; then - echo "Note: --with edge implies --with ${dep} (edge depends on all backend services)" - with_services="${with_services},${dep}" - fi - done - fi - - # Validate all service names are known - local IFS=',' - for _svc in $with_services; do - _svc=$(echo "$_svc" | xargs) - case "$_svc" in - forgejo|woodpecker-server|woodpecker-agent|agents|staging|chat|edge) ;; - *) - echo "Error: unknown service '${_svc}' — known: forgejo, woodpecker-server, woodpecker-agent, agents, staging, chat, edge" >&2 - exit 1 - ;; - esac - done - unset IFS - fi - - # --import-* flag validation (S2.5). These three flags form an import - # triple and must be consistent before dispatch: sops encryption is - # useless without the age key to decrypt it, so either both --import-sops - # and --age-key are present or neither is. --import-env alone is fine - # (it just imports the plaintext dotenv). All three flags are nomad-only. 
+ # Import flags validation + # --import-sops requires --age-key if [ -n "$import_sops" ] && [ -z "$age_key" ]; then echo "Error: --import-sops requires --age-key" >&2 exit 1 fi + + # --age-key requires --import-sops if [ -n "$age_key" ] && [ -z "$import_sops" ]; then echo "Error: --age-key requires --import-sops" >&2 exit 1 fi - if { [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; } \ - && [ "$backend" != "nomad" ]; then - echo "Error: --import-env, --import-sops, and --age-key require --backend=nomad" >&2 - exit 1 - fi - # --empty is the cluster-only escape hatch — it skips policies, auth, - # import, and deploy. Pairing it with --import-* silently does nothing, - # which is a worse failure mode than a clear error. Reject explicitly. - if [ "$empty" = true ] \ - && { [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; }; then - echo "Error: --empty and --import-env/--import-sops/--age-key are mutually exclusive" >&2 - exit 1 + # --import-* flags require --backend=nomad + if [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; then + if [ "$backend" != "nomad" ]; then + echo "Error: --import-env, --import-sops, and --age-key require --backend=nomad" >&2 + exit 1 + fi fi # Dispatch on backend — the nomad path runs lib/init/nomad/cluster-up.sh # (S0.4). The default and --empty variants are identical today; Step 1 # will branch on $empty to add job deployment to the default path. if [ "$backend" = "nomad" ]; then - _disinto_init_nomad "$dry_run" "$empty" "$with_services" \ - "$import_env" "$import_sops" "$age_key" + _disinto_init_nomad "$dry_run" "$empty" "$with_services" "$import_env" "$import_sops" "$age_key" # shellcheck disable=SC2317 # _disinto_init_nomad always exits today; # `return` is defensive against future refactors. 
return @@ -1389,6 +1094,7 @@ p.write_text(text) echo "" echo "[ensure] Forgejo admin user 'disinto-admin'" echo "[ensure] 8 bot users: dev-bot, review-bot, planner-bot, gardener-bot, vault-bot, supervisor-bot, predictor-bot, architect-bot" + echo "[ensure] 2 llama bot users: dev-qwen, dev-qwen-nightly" echo "[ensure] .profile repos for all bots" echo "[ensure] repo ${forge_repo} on Forgejo with collaborators" echo "[run] preflight checks" @@ -1428,7 +1134,7 @@ p.write_text(text) echo "[ensure] CLAUDE_CONFIG_DIR" echo "[ensure] state files (.dev-active, .reviewer-active, .gardener-active)" echo "" - echo "Dry run complete — no changes made." + echo "Dry run complete - no changes made." exit 0 fi @@ -1584,6 +1290,19 @@ p.write_text(text) echo "Config: CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1 saved to .env" fi + # Write local-Qwen dev agent env keys with safe defaults (#769) + if ! grep -q '^ENABLE_LLAMA_AGENT=' "$env_file" 2>/dev/null; then + cat >> "$env_file" <<'LLAMAENVEOF' + +# Local Qwen dev agent (optional) — set to 1 to enable +ENABLE_LLAMA_AGENT=0 +FORGE_TOKEN_LLAMA= +FORGE_PASS_LLAMA= +ANTHROPIC_BASE_URL= +LLAMAENVEOF + echo "Config: ENABLE_LLAMA_AGENT keys written to .env (disabled by default)" + fi + # Create labels on remote create_labels "$forge_repo" "$forge_url" diff --git a/dev/AGENTS.md b/dev/AGENTS.md index 5e6f085..481bb1f 100644 --- a/dev/AGENTS.md +++ b/dev/AGENTS.md @@ -1,4 +1,4 @@ - + # Dev Agent **Role**: Implement issues autonomously — write code, push branches, address diff --git a/docker-compose.yml b/docker-compose.yml index c4676f2..ba8c77c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,6 +15,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - 
${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -77,6 +78,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -137,6 +139,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro diff --git a/docker/agents/Dockerfile b/docker/agents/Dockerfile index fa3b2d8..1bcba89 100644 --- a/docker/agents/Dockerfile +++ b/docker/agents/Dockerfile @@ -1,26 +1,21 @@ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ - bash curl git jq tmux nodejs npm python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ + bash curl git jq tmux python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ && pip3 install --break-system-packages networkx tomlkit \ && rm -rf /var/lib/apt/lists/* # Pre-built binaries (copied from docker/agents/bin/) # SOPS — encrypted data decryption tool -# Download sops binary (replaces manual COPY of vendored binary) -ARG SOPS_VERSION=3.9.4 -RUN curl -fsSL "https://github.com/getsops/sops/releases/download/v${SOPS_VERSION}/sops-v${SOPS_VERSION}.linux.amd64" \ - -o /usr/local/bin/sops && chmod +x /usr/local/bin/sops +COPY docker/agents/bin/sops 
/usr/local/bin/sops +RUN chmod +x /usr/local/bin/sops # tea CLI — official Gitea/Forgejo CLI for issue/label/comment operations -# Download tea binary (replaces manual COPY of vendored binary) -ARG TEA_VERSION=0.9.2 -RUN curl -fsSL "https://dl.gitea.com/tea/${TEA_VERSION}/tea-${TEA_VERSION}-linux-amd64" \ - -o /usr/local/bin/tea && chmod +x /usr/local/bin/tea +COPY docker/agents/bin/tea /usr/local/bin/tea +RUN chmod +x /usr/local/bin/tea -# Install Claude Code CLI — agent runtime for all LLM backends (llama, Claude API). -# The CLI is the execution environment; ANTHROPIC_BASE_URL selects the model provider. -RUN npm install -g @anthropic-ai/claude-code@2.1.84 +# Claude CLI is mounted from the host via docker-compose volume. +# No internet access to cli.anthropic.com required at build time. # Non-root user RUN useradd -m -u 1000 -s /bin/bash agent diff --git a/docker/agents/entrypoint.sh b/docker/agents/entrypoint.sh index 7c58674..89a520b 100644 --- a/docker/agents/entrypoint.sh +++ b/docker/agents/entrypoint.sh @@ -17,38 +17,6 @@ set -euo pipefail # - predictor: every 24 hours (288 iterations * 5 min) # - supervisor: every SUPERVISOR_INTERVAL seconds (default: 1200 = 20 min) -# ── Migration check: reject ENABLE_LLAMA_AGENT ─────────────────────────────── -# #846: The legacy ENABLE_LLAMA_AGENT env flag is no longer supported. -# Activation is now done exclusively via [agents.X] sections in project TOML. -# If this legacy flag is detected, fail immediately with a migration message. -if [ "${ENABLE_LLAMA_AGENT:-}" = "1" ]; then - cat <<'MIGRATION_ERR' -FATAL: ENABLE_LLAMA_AGENT is no longer supported. - -The legacy ENABLE_LLAMA_AGENT=1 flag has been removed (#846). -Activation is now done exclusively via [agents.X] sections in projects/*.toml. - -To migrate: - 1. Remove ENABLE_LLAMA_AGENT from your .env or .env.enc file - 2. Add an [agents.] 
section to your project TOML: - - [agents.dev-qwen] - base_url = "http://your-llama-server:8081" - model = "unsloth/Qwen3.5-35B-A3B" - api_key = "sk-no-key-required" - roles = ["dev"] - forge_user = "dev-qwen" - compact_pct = 60 - poll_interval = 60 - - 3. Run: disinto init - 4. Start the agent: docker compose up -d agents-dev-qwen - -See docs/agents-llama.md for full details. -MIGRATION_ERR - exit 1 -fi - DISINTO_BAKED="/home/agent/disinto" DISINTO_LIVE="/home/agent/repos/_factory" DISINTO_DIR="$DISINTO_BAKED" # start with baked copy; switched to live checkout after bootstrap @@ -378,19 +346,15 @@ bootstrap_factory_repo # This prevents the silent-zombie mode where the polling loop matches zero files # and does nothing forever. validate_projects_dir() { - # NOTE: compgen -G exits non-zero when no matches exist, so piping it through - # `wc -l` under `set -eo pipefail` aborts the script before the FATAL branch - # can log a diagnostic (#877). Use the conditional form already adopted at - # lines above (see bootstrap_factory_repo, PROJECT_NAME parsing). - if ! compgen -G "${DISINTO_DIR}/projects/*.toml" >/dev/null 2>&1; then + local toml_count + toml_count=$(compgen -G "${DISINTO_DIR}/projects/*.toml" 2>/dev/null | wc -l) + if [ "$toml_count" -eq 0 ]; then log "FATAL: No real .toml files found in ${DISINTO_DIR}/projects/" log "Expected at least one project config file (e.g., disinto.toml)" log "The directory only contains *.toml.example template files." log "Mount the host ./projects volume or copy real .toml files into the container." 
exit 1 fi - local toml_count - toml_count=$(compgen -G "${DISINTO_DIR}/projects/*.toml" | wc -l) log "Projects directory validated: ${toml_count} real .toml file(s) found" } diff --git a/docker/chat/Dockerfile b/docker/chat/Dockerfile index c4cb28b..3d89863 100644 --- a/docker/chat/Dockerfile +++ b/docker/chat/Dockerfile @@ -1,22 +1,20 @@ # disinto-chat — minimal HTTP backend for Claude chat UI # -# Small Debian slim base with Python runtime and Node.js. +# Small Debian slim base with Python runtime. # Chosen for simplicity and small image size (~100MB). # # Image size: ~100MB (well under the 200MB ceiling) # -# Claude CLI is baked into the image — same pattern as the agents container. +# The claude binary is mounted from the host at runtime via docker-compose, +# not baked into the image — same pattern as the agents container. FROM debian:bookworm-slim -# Install Node.js (required for Claude CLI) and Python +# Install Python (no build-time network access needed) RUN apt-get update && apt-get install -y --no-install-recommends \ - nodejs npm python3 \ + python3 \ && rm -rf /var/lib/apt/lists/* -# Install Claude Code CLI — chat backend runtime -RUN npm install -g @anthropic-ai/claude-code@2.1.84 - # Non-root user — fixed UID 10001 for sandbox hardening (#706) RUN useradd -m -u 10001 -s /bin/bash chat diff --git a/docker/edge/dispatcher.sh b/docker/edge/dispatcher.sh index 282342a..a48abf2 100755 --- a/docker/edge/dispatcher.sh +++ b/docker/edge/dispatcher.sh @@ -560,168 +560,10 @@ _launch_runner_docker() { # _launch_runner_nomad ACTION_ID SECRETS_CSV MOUNTS_CSV # -# Dispatches a vault-runner batch job via `nomad job dispatch`. -# Polls `nomad job status` until terminal state (completed/failed). -# Reads exit code from allocation and writes .result.json. -# -# Usage: _launch_runner_nomad -# Returns: exit code of the nomad job (0=success, non-zero=failure) +# Nomad backend stub — will be implemented in migration Step 5. 
_launch_runner_nomad() { - local action_id="$1" - local secrets_csv="$2" - local mounts_csv="$3" - - log "Dispatching vault-runner batch job via Nomad for action: ${action_id}" - - # Dispatch the parameterized batch job - # The vault-runner job expects meta: action_id, secrets_csv - # Note: mounts_csv is not passed as meta (not declared in vault-runner.hcl) - local dispatch_output - dispatch_output=$(nomad job dispatch \ - -detach \ - -meta action_id="$action_id" \ - -meta secrets_csv="$secrets_csv" \ - vault-runner 2>&1) || { - log "ERROR: Failed to dispatch vault-runner job for ${action_id}" - log "Dispatch output: ${dispatch_output}" - write_result "$action_id" 1 "Nomad dispatch failed: ${dispatch_output}" - return 1 - } - - # Extract dispatched job ID from output (format: "vault-runner/dispatch--") - local dispatched_job_id - dispatched_job_id=$(echo "$dispatch_output" | grep -oP '(?<=Dispatched Job ID = ).+' || true) - - if [ -z "$dispatched_job_id" ]; then - log "ERROR: Could not extract dispatched job ID from nomad output" - log "Dispatch output: ${dispatch_output}" - write_result "$action_id" 1 "Could not extract dispatched job ID from nomad output" - return 1 - fi - - log "Dispatched vault-runner with job ID: ${dispatched_job_id}" - - # Poll job status until terminal state - # Batch jobs transition: running -> completed/failed - local max_wait=300 # 5 minutes max wait - local elapsed=0 - local poll_interval=5 - local alloc_id="" - - log "Polling nomad job status for ${dispatched_job_id}..." 
- - while [ "$elapsed" -lt "$max_wait" ]; do - # Get job status with JSON output for the dispatched child job - local job_status_json - job_status_json=$(nomad job status -json "$dispatched_job_id" 2>/dev/null) || { - log "ERROR: Failed to get job status for ${dispatched_job_id}" - write_result "$action_id" 1 "Failed to get job status for ${dispatched_job_id}" - return 1 - } - - # Check job status field (transitions to "dead" on completion) - local job_state - job_state=$(echo "$job_status_json" | jq -r '.Status // empty' 2>/dev/null) || job_state="" - - # Check allocation state directly - alloc_id=$(echo "$job_status_json" | jq -r '.Allocations[0]?.ID // empty' 2>/dev/null) || alloc_id="" - - if [ -n "$alloc_id" ]; then - local alloc_state - alloc_state=$(nomad alloc status -short "$alloc_id" 2>/dev/null || true) - - case "$alloc_state" in - *completed*|*success*|*dead*) - log "Allocation ${alloc_id} reached terminal state: ${alloc_state}" - break - ;; - *running*|*pending*|*starting*) - log "Allocation ${alloc_id} still running (state: ${alloc_state})..." 
- ;; - *failed*|*crashed*) - log "Allocation ${alloc_id} failed (state: ${alloc_state})" - break - ;; - esac - fi - - # Also check job-level state - case "$job_state" in - dead) - log "Job ${dispatched_job_id} reached terminal state: ${job_state}" - break - ;; - failed) - log "Job ${dispatched_job_id} failed" - break - ;; - esac - - sleep "$poll_interval" - elapsed=$((elapsed + poll_interval)) - done - - if [ "$elapsed" -ge "$max_wait" ]; then - log "ERROR: Timeout waiting for vault-runner job to complete" - write_result "$action_id" 1 "Timeout waiting for nomad job to complete" - return 1 - fi - - # Get final job status and exit code - local final_status_json - final_status_json=$(nomad job status -json "$dispatched_job_id" 2>/dev/null) || { - log "ERROR: Failed to get final job status" - write_result "$action_id" 1 "Failed to get final job status" - return 1 - } - - # Get allocation exit code - local exit_code=0 - local logs="" - - if [ -n "$alloc_id" ]; then - # Get allocation logs - logs=$(nomad alloc logs -short "$alloc_id" 2>/dev/null || true) - - # Try to get exit code from alloc status JSON - # Nomad alloc status -json has .TaskStates[""].Events[].ExitCode - local alloc_exit_code - alloc_exit_code=$(nomad alloc status -json "$alloc_id" 2>/dev/null | jq -r '.TaskStates["runner"].Events[-1].ExitCode // empty' 2>/dev/null) || alloc_exit_code="" - - if [ -n "$alloc_exit_code" ] && [ "$alloc_exit_code" != "null" ]; then - exit_code="$alloc_exit_code" - fi - fi - - # If we couldn't get exit code from alloc, check job state as fallback - # Note: "dead" = terminal state for batch jobs (includes successful completion) - # Only "failed" indicates actual failure - if [ "$exit_code" -eq 0 ]; then - local final_state - final_state=$(echo "$final_status_json" | jq -r '.Status // empty' 2>/dev/null) || final_state="" - - case "$final_state" in - failed) - exit_code=1 - ;; - esac - fi - - # Truncate logs if too long - if [ ${#logs} -gt 1000 ]; then - logs="${logs: -1000}" 
- fi - - # Write result file - write_result "$action_id" "$exit_code" "$logs" - - if [ "$exit_code" -eq 0 ]; then - log "Vault-runner job completed successfully for action: ${action_id}" - else - log "Vault-runner job failed for action: ${action_id} (exit code: ${exit_code})" - fi - - return "$exit_code" + echo "nomad backend not yet implemented" >&2 + return 1 } # Launch runner for the given action (backend-agnostic orchestrator) @@ -1209,8 +1051,11 @@ main() { # Validate backend selection at startup case "$DISPATCHER_BACKEND" in - docker|nomad) - log "Using ${DISPATCHER_BACKEND} backend for vault-runner dispatch" + docker) ;; + nomad) + log "ERROR: nomad backend not yet implemented" + echo "nomad backend not yet implemented" >&2 + exit 1 ;; *) log "ERROR: unknown DISPATCHER_BACKEND=${DISPATCHER_BACKEND}" diff --git a/docker/edge/entrypoint-edge.sh b/docker/edge/entrypoint-edge.sh index 83131fb..1b5f94f 100755 --- a/docker/edge/entrypoint-edge.sh +++ b/docker/edge/entrypoint-edge.sh @@ -173,15 +173,11 @@ PROJECT_TOML="${PROJECT_TOML:-projects/disinto.toml}" sleep 1200 # 20 minutes done) & -# ── Load optional secrets from secrets/*.enc (#777) ──────────────────── -# Engagement collection (collect-engagement.sh) requires CADDY_ secrets to -# SCP access logs from a remote edge host. When age key or secrets dir is -# missing, or any secret fails to decrypt, log a warning and skip the cron. -# Caddy itself does not depend on these secrets. +# ── Load required secrets from secrets/*.enc (#777) ──────────────────── +# Edge container declares its required secrets; missing ones cause a hard fail. 
_AGE_KEY_FILE="${HOME}/.config/sops/age/keys.txt" _SECRETS_DIR="/opt/disinto/secrets" EDGE_REQUIRED_SECRETS="CADDY_SSH_KEY CADDY_SSH_HOST CADDY_SSH_USER CADDY_ACCESS_LOG" -EDGE_ENGAGEMENT_READY=0 # Assume not ready until proven otherwise _edge_decrypt_secret() { local enc_path="${_SECRETS_DIR}/${1}.enc" @@ -196,60 +192,47 @@ if [ -f "$_AGE_KEY_FILE" ] && [ -d "$_SECRETS_DIR" ]; then export "$_secret_name=$_val" done if [ -n "$_missing" ]; then - echo "WARN: required engagement secrets missing from secrets/*.enc:${_missing}" >&2 - echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 - EDGE_ENGAGEMENT_READY=0 - else - echo "edge: loaded required engagement secrets: ${EDGE_REQUIRED_SECRETS}" >&2 - EDGE_ENGAGEMENT_READY=1 + echo "FATAL: required secrets missing from secrets/*.enc:${_missing}" >&2 + echo " Run 'disinto secrets add ' for each missing secret." >&2 + echo " If migrating from .env.vault.enc, run 'disinto secrets migrate-from-vault' first." >&2 + exit 1 fi + echo "edge: loaded required secrets: ${EDGE_REQUIRED_SECRETS}" >&2 else - echo "WARN: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — engagement secrets unavailable" >&2 - echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 - EDGE_ENGAGEMENT_READY=0 + echo "FATAL: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — cannot load required secrets" >&2 + echo " Ensure age is installed and secrets/*.enc files are present." >&2 + exit 1 fi # Start daily engagement collection cron loop in background (#745) # Runs collect-engagement.sh daily at ~23:50 UTC via a sleep loop that # calculates seconds until the next 23:50 window. SSH key from secrets/*.enc (#777). -# Guarded: only start if EDGE_ENGAGEMENT_READY=1. 
-if [ "$EDGE_ENGAGEMENT_READY" -eq 1 ]; then - (while true; do - # Calculate seconds until next 23:50 UTC - _now=$(date -u +%s) - _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) - if [ "$_target" -le "$_now" ]; then - _target=$(( _target + 86400 )) - fi - _sleep_secs=$(( _target - _now )) - echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 - sleep "$_sleep_secs" - _fetch_log="/tmp/caddy-access-log-fetch.log" - _ssh_key_file=$(mktemp) - printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" - chmod 0600 "$_ssh_key_file" - scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ - "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ - "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true - rm -f "$_ssh_key_file" - if [ -s "$_fetch_log" ]; then - CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ - | tee -a /opt/disinto-logs/collect-engagement.log || true - else - echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 - fi - rm -f "$_fetch_log" - done) & -else - echo "edge: collect-engagement cron skipped (EDGE_ENGAGEMENT_READY=0)" >&2 -fi - -# Nomad template renders Caddyfile to /local/Caddyfile via service discovery; -# copy it into the expected location if present (compose uses the mounted path). 
-if [ -f /local/Caddyfile ]; then - cp /local/Caddyfile /etc/caddy/Caddyfile - echo "edge: using Nomad-rendered Caddyfile from /local/Caddyfile" >&2 -fi +(while true; do + # Calculate seconds until next 23:50 UTC + _now=$(date -u +%s) + _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) + if [ "$_target" -le "$_now" ]; then + _target=$(( _target + 86400 )) + fi + _sleep_secs=$(( _target - _now )) + echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 + sleep "$_sleep_secs" + _fetch_log="/tmp/caddy-access-log-fetch.log" + _ssh_key_file=$(mktemp) + printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" + chmod 0600 "$_ssh_key_file" + scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ + "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ + "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true + rm -f "$_ssh_key_file" + if [ -s "$_fetch_log" ]; then + CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ + | tee -a /opt/disinto-logs/collect-engagement.log || true + else + echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 + fi + rm -f "$_fetch_log" +done) & # Caddy as main process — run in foreground via wait so background jobs survive # (exec replaces the shell, which can orphan backgrounded subshells) diff --git a/docs/agents-llama.md b/docs/agents-llama.md index b3a1334..bc973b7 100644 --- a/docs/agents-llama.md +++ b/docs/agents-llama.md @@ -2,12 +2,9 @@ Local-model agents run the same agent code as the Claude-backed agents, but connect to a local llama-server (or compatible OpenAI-API endpoint) instead of -the Anthropic API. This document describes the canonical activation flow using +the Anthropic API. This document describes the current activation flow using `disinto hire-an-agent` and `[agents.X]` TOML configuration. 
-> **Note:** The legacy `ENABLE_LLAMA_AGENT=1` env flag has been removed (#846). -> Activation is now done exclusively via `[agents.X]` sections in project TOML. - ## Overview Local-model agents are configured via `[agents.X]` sections in diff --git a/docs/nomad-migration.md b/docs/nomad-migration.md deleted file mode 100644 index 02ff023..0000000 --- a/docs/nomad-migration.md +++ /dev/null @@ -1,124 +0,0 @@ - -# Nomad+Vault migration — cutover-day runbook - -`disinto init --backend=nomad` is the single entry-point that turns a fresh -LXC (with the disinto repo cloned) into a running Nomad+Vault cluster with -policies applied, JWT workload-identity auth configured, secrets imported -from the old docker stack, and services deployed. - -## Cutover-day invocation - -On the new LXC, as root (or an operator with NOPASSWD sudo): - -```bash -# Copy the plaintext .env + sops-encrypted .env.vault.enc + age keyfile -# from the old box first (out of band — SSH, USB, whatever your ops -# procedure allows). Then: - -sudo ./bin/disinto init \ - --backend=nomad \ - --import-env /tmp/.env \ - --import-sops /tmp/.env.vault.enc \ - --age-key /tmp/keys.txt \ - --with forgejo -``` - -This runs, in order: - -1. **`lib/init/nomad/cluster-up.sh`** (S0) — installs Nomad + Vault - binaries, writes `/etc/nomad.d/*`, initializes Vault, starts both - services, waits for the Nomad node to become ready. -2. **`tools/vault-apply-policies.sh`** (S2.1) — syncs every - `vault/policies/*.hcl` into Vault as an ACL policy. Idempotent. -3. **`lib/init/nomad/vault-nomad-auth.sh`** (S2.3) — enables Vault's - JWT auth method at `jwt-nomad`, points it at Nomad's JWKS, writes - one role per policy, reloads Nomad so jobs can exchange - workload-identity tokens for Vault tokens. Idempotent. -4.
**`tools/vault-import.sh`** (S2.2) — reads `/tmp/.env` and the - sops-decrypted `/tmp/.env.vault.enc`, writes them to the KV paths - matching the S2.1 policy layout (`kv/disinto/bots/*`, `kv/disinto/shared/*`, - `kv/disinto/runner/*`). Idempotent (overwrites KV v2 data in place). -5. **`lib/init/nomad/deploy.sh forgejo`** (S1) — validates + runs the - `nomad/jobs/forgejo.hcl` jobspec. Forgejo reads its admin creds from - Vault via the `template` stanza (S2.4). - -## Flag summary - -| Flag | Meaning | -|---|---| -| `--backend=nomad` | Switch the init dispatcher to the Nomad+Vault path (instead of docker compose). | -| `--empty` | Bring the cluster up, skip policies/auth/import/deploy. Escape hatch for debugging. | -| `--with forgejo[,…]` | Deploy these services after the cluster is up. | -| `--import-env PATH` | Plaintext `.env` from the old stack. Optional. | -| `--import-sops PATH` | Sops-encrypted `.env.vault.enc` from the old stack. Requires `--age-key`. | -| `--age-key PATH` | Age keyfile used to decrypt `--import-sops`. Requires `--import-sops`. | -| `--dry-run` | Print the full plan (cluster-up + policies + auth + import + deploy) and exit. Touches nothing. | - -### Flag validation - -- `--import-sops` without `--age-key` → error. -- `--age-key` without `--import-sops` → error. -- `--import-env` alone (no sops) → OK (imports just the plaintext `.env`). -- `--backend=docker` with any `--import-*` flag → error. -- `--empty` with any `--import-*` flag → error (mutually exclusive: `--empty` - skips the import step, so pairing them silently discards the import - intent). - -## Idempotency - -Every layer is idempotent by design. Re-running the same command on an -already-provisioned box is a no-op at every step: - -- **Cluster-up:** second run detects running `nomad`/`vault` systemd - units and state files, skips re-init. -- **Policies:** byte-for-byte compare against on-server policy text; - "unchanged" for every untouched file. 
-- **Auth:** skips auth-method create if `jwt-nomad/` already enabled, - skips config write if the JWKS + algs match, skips server.hcl write if - the file on disk is identical to the repo copy. -- **Import:** KV v2 writes overwrite in place (same path, same keys, - same values → no new version). -- **Deploy:** `nomad job run` is declarative; same jobspec → no new - allocation. - -## Dry-run - -```bash -./bin/disinto init --backend=nomad \ - --import-env /tmp/.env \ - --import-sops /tmp/.env.vault.enc \ - --age-key /tmp/keys.txt \ - --with forgejo \ - --dry-run -``` - -Prints the five-section plan — cluster-up, policies, auth, import, -deploy — with every path and every argv that would be executed. No -network, no sudo, no state mutation. See -`tests/disinto-init-nomad.bats` for the exact output shape. - -## No-import path - -If you already have `kv/disinto/*` seeded by other means (manual -`vault kv put`, a replica, etc.), omit all three `--import-*` flags. -`disinto init --backend=nomad --with forgejo` still applies policies, -configures auth, and deploys — but skips the import step with: - -``` -[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services -``` - -Forgejo's template stanza will fail to render (and thus the allocation -will stall) until those KV paths exist — so either import them or seed -them first. - -## Secret hygiene - -- Never log a secret value. The CLI only prints paths (`--import-env`, - `--age-key`) and KV *paths* (`kv/disinto/bots/review/token`), never - the values themselves. `tools/vault-import.sh` is the only thing that - reads the values, and it pipes them directly into Vault's HTTP API. -- The age keyfile must be mode 0400 — `vault-import.sh` refuses to - source a keyfile with looser permissions. -- `VAULT_ADDR` must be localhost during import — the import tool - refuses to run against a remote Vault, preventing accidental exposure. 
diff --git a/formulas/release.sh b/formulas/release.sh index 6526d1a..b8c4eb6 100644 --- a/formulas/release.sh +++ b/formulas/release.sh @@ -178,8 +178,8 @@ log "Tagged disinto/agents:${RELEASE_VERSION}" log "Step 6/6: Restarting agent containers" -docker compose stop agents 2>/dev/null || true -docker compose up -d agents +docker compose stop agents agents-llama 2>/dev/null || true +docker compose up -d agents agents-llama log "Agent containers restarted" # ── Done ───────────────────────────────────────────────────────────────── diff --git a/formulas/release.toml b/formulas/release.toml index ccd7f95..f702f42 100644 --- a/formulas/release.toml +++ b/formulas/release.toml @@ -189,10 +189,10 @@ Restart agent containers to use the new image. - docker compose pull agents 2. Stop and remove existing agent containers: - - docker compose down agents + - docker compose down agents agents-llama 2>/dev/null || true 3. Start agents with new image: - - docker compose up -d agents + - docker compose up -d agents agents-llama 4. Wait for containers to be healthy: - for i in {1..30}; do @@ -203,7 +203,7 @@ Restart agent containers to use the new image. - done 5. Verify containers are running: - - docker compose ps agents + - docker compose ps agents agents-llama 6. Log restart: - echo "Restarted agents containers" diff --git a/formulas/run-supervisor.toml b/formulas/run-supervisor.toml index 4101252..f31e6bc 100644 --- a/formulas/run-supervisor.toml +++ b/formulas/run-supervisor.toml @@ -29,7 +29,7 @@ and injected into your prompt above. Review them now. 1. Read the injected metrics data carefully (System Resources, Docker, Active Sessions, Phase Files, Stale Phase Cleanup, Lock Files, Agent Logs, - CI Pipelines, Open PRs, Issue Status, Stale Worktrees, **Woodpecker Agent Health**). + CI Pipelines, Open PRs, Issue Status, Stale Worktrees). Note: preflight.sh auto-removes PHASE:escalate files for closed issues (24h grace period). 
Check the "Stale Phase Cleanup" section for any files cleaned or in grace period this run. @@ -75,10 +75,6 @@ Categorize every finding from the metrics into priority levels. - Dev/action sessions in PHASE:escalate for > 24h (session timeout) (Note: PHASE:escalate files for closed issues are auto-cleaned by preflight; this check covers sessions where the issue is still open) -- **Woodpecker agent unhealthy** — see "Woodpecker Agent Health" section in preflight: - - Container not running or in unhealthy state - - gRPC errors >= 3 in last 20 minutes - - Fast-failure pipelines (duration < 60s) >= 3 in last 15 minutes ### P3 — Factory degraded - PRs stale: CI finished >20min ago AND no git push to the PR branch since CI completed @@ -104,15 +100,6 @@ For each finding from the health assessment, decide and execute an action. ### Auto-fixable (execute these directly) -**P2 Woodpecker agent unhealthy:** -The supervisor-run.sh script automatically handles WP agent recovery: -- Detects unhealthy state via preflight.sh health checks -- Restarts container via `docker restart` -- Scans for `blocked: ci_exhausted` issues updated in last 30 minutes -- Unassigns and removes blocked label from affected issues -- Posts recovery comment with infra-flake context -- Avoids duplicate restarts via 5-minute cooldown in history file - **P0 Memory crisis:** # Kill stale one-shot claude processes (>3h old) pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true @@ -261,11 +248,6 @@ Format: - (or "No actions needed") - ### WP Agent Recovery (if applicable) - - WP agent restart: