diff --git a/.woodpecker/detect-duplicates.py b/.woodpecker/detect-duplicates.py index f3bf5b1..58fc160 100644 --- a/.woodpecker/detect-duplicates.py +++ b/.woodpecker/detect-duplicates.py @@ -301,28 +301,6 @@ def main() -> int: "9a57368f3c1dfd29ec328596b86962a0": "Flag parsing loop + case start (vault-seed-woodpecker + wp-oauth-register)", "9d72d40ff303cbed0b7e628fc15381c3": "Case loop + dry-run handler (vault-seed-woodpecker + wp-oauth-register)", "5b52ddbbf47948e3cbc1b383f0909588": "Help + invalid arg handler end (vault-seed-woodpecker + wp-oauth-register)", - # Common vault-seed script preamble + precondition patterns - # Shared across tools/vault-seed-{forgejo,agents,woodpecker}.sh - "dff3675c151fcdbd2fef798826ae919b": "Vault-seed preamble: set -euo + path setup + source hvault.sh + KV_MOUNT", - "1cd9f0d083e24e6e6b2071db9b6dae09": "Vault-seed preconditions: binary check loop + VAULT_ADDR guard", - "63bfa88d71764c95c65a9a248f3e40ab": "Vault-seed preconditions: binary check end + VAULT_ADDR die", - "34873ad3570b211ce1d90468ab6ac94c": "Vault-seed preconditions: VAULT_ADDR die + hvault_token_lookup", - "71a52270f249e843cda48ad896d9f781": "Vault-seed preconditions: VAULT_ADDR + hvault_token_lookup + die", - # Common vault-seed script flag parsing patterns - # Shared across tools/vault-seed-{forgejo,ops-repo}.sh - "6906b7787796c2ccb8dd622e2ad4e7bf": "vault-seed DRY_RUN init + case pattern (forgejo + ops-repo)", - "a0df5283b616b964f8bc32fd99ec1b5a": "vault-seed case pattern start (forgejo + ops-repo)", - "e15e3272fdd9f0f46ce9e726aea9f853": "vault-seed case pattern dry-run handler (forgejo + ops-repo)", - "c9f22385cc49a3dac1d336bc14c6315b": "vault-seed DRY_RUN assignment (forgejo + ops-repo)", - "106f4071e88f841b3208b01144cd1c39": "vault-seed case pattern dry-run end (forgejo + ops-repo)", - "c15506dcb6bb340b25d1c39d442dd2e6": "vault-seed help text + invalid arg handler (forgejo + ops-repo)", - "1feecd3b3caf00045fae938ddf2811de": "vault-seed invalid arg handler (forgejo + ops-repo)", - "919780d5e7182715344f5aa02b191294": "vault-seed invalid arg + esac pattern (forgejo + ops-repo)", - "8dce1d292bce8e60ef4c0665b62945b0": "vault-seed esac + binary check loop (forgejo + ops-repo)", - "ca043687143a5b47bd54e65a99ce8ee8": "vault-seed binary check loop start (forgejo + ops-repo)", - "aefd9f655411a955395e6e5995ddbe6f": "vault-seed binary check pattern (forgejo + ops-repo)", - "60f0c46deb5491599457efb4048918e5": "vault-seed VAULT_ADDR + hvault_token_lookup check (forgejo + ops-repo)", - "f6838f581ef6b4d82b55268389032769": "vault-seed VAULT_ADDR + hvault_token_lookup die (forgejo + ops-repo)", } if not sh_files: diff --git a/AGENTS.md b/AGENTS.md index 9c42667..fced0c6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,4 +1,4 @@ - + # Disinto — Agent Instructions ## What this repo is @@ -37,9 +37,9 @@ disinto/ (code repo) │ examples/ — example vault action TOMLs (promote, publish, release, webhook-call) ├── lib/ env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, stack-lock.sh, forge-setup.sh, forge-push.sh, ops-setup.sh, ci-setup.sh, generators.sh, hire-agent.sh, release.sh, build-graph.py, branch-protection.sh, secret-scan.sh, tea-helpers.sh, action-vault.sh, ci-log-reader.py, git-creds.sh, sprint-filer.sh, hvault.sh │ hooks/ — Claude Code session hooks (on-compact-reinject, on-idle-stop, on-phase-change, on-pretooluse-guard, on-session-end, on-stop-failure) -│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825); wp-oauth-register.sh (Forgejo OAuth2 app + Vault KV seeder for Woodpecker, S3.3); deploy.sh (dependency-ordered Nomad job deploy + health-wait, S4) -├── nomad/ server.hcl, client.hcl (allow_privileged for woodpecker-agent, S3-fix-5), vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh -│ jobs/ — Nomad jobspecs: forgejo.hcl (Vault secrets via template, S2.4); woodpecker-server.hcl + woodpecker-agent.hcl (host-net, docker.sock, Vault KV, S3.1-S3.2); agents.hcl (7 roles, llama, Vault-templated bot tokens, S4.1); vault-runner.hcl (parameterized batch dispatch, S5.3); staging.hcl (Caddy file-server, dynamic port — edge discovers via service registration, S5.2); chat.hcl (Claude chat UI, tmpfs via mount block, Vault OAuth secrets, S5.2); edge.hcl (Caddy proxy + dispatcher sidecar, S5.1) +│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825) +├── nomad/ server.hcl, client.hcl, vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh +│ jobs/ — Nomad jobspecs (forgejo.hcl reads Vault secrets via template stanza, S2.4) ├── projects/ *.toml.example — templates; *.toml — local per-box config (gitignored) ├── formulas/ Issue templates (TOML specs for multi-step agent tasks) ├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge dispatcher, chat (server.py, entrypoint-chat.sh, Dockerfile, ui/) diff --git a/architect/AGENTS.md b/architect/AGENTS.md index 7286ee3..51b24b1 100644 --- a/architect/AGENTS.md +++ b/architect/AGENTS.md @@ -1,4 +1,4 @@ - + # Architect — Agent Instructions ## What this agent is diff --git a/bin/disinto b/bin/disinto index 7f6379d..ca1da71 100755 --- a/bin/disinto +++ b/bin/disinto @@ -82,7 +82,7 @@ Init options: --ci-id Woodpecker CI repo ID (default: 0 = no CI) --forge-url Forge base URL (default: http://localhost:3000) --backend Orchestration backend: docker (default) | nomad - --with (nomad) Deploy services: forgejo,woodpecker,agents,staging,chat,edge[,...] (S1.3, S3.4, S4.2, S5.2, S5.5) + --with (nomad) Deploy services: forgejo,woodpecker-server,woodpecker-agent[,...] (S1.3, S3.4) --empty (nomad) Bring up cluster only, no jobs (S0.4) --bare Skip compose generation (bare-metal setup) --build Use local docker build instead of registry images (dev mode) @@ -783,37 +783,60 @@ _disinto_init_nomad() { fi if [ -n "$with_services" ]; then - # Interleaved seed/deploy per service (S2.6, #928, #948): match the - # real-run path so dry-run output accurately represents execution order. - # Build ordered deploy list: only include services present in with_services + # Normalize services: auto-include forgejo when woodpecker is requested + # (woodpecker without forgejo is nonsensical) + local normalized_services="$with_services" + if echo "$with_services" | grep -q "woodpecker" && ! echo "$with_services" | grep -q "forgejo"; then + echo "Note: --with woodpecker implies --with forgejo (OAuth dependency)" + normalized_services="forgejo,${with_services}" + fi + + # Define deployment order: forgejo -> woodpecker-server -> woodpecker-agent + # Only include services that are requested (after normalization) local DEPLOY_ORDER="" - for ordered_svc in forgejo woodpecker-server woodpecker-agent agents staging chat edge; do - if echo ",$with_services," | grep -q ",$ordered_svc,"; then - DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}" + for ordered_svc in forgejo woodpecker-server woodpecker-agent; do + if echo ",$normalized_services," | grep -q ",$ordered_svc,"; then + if [ -z "$DEPLOY_ORDER" ]; then + DEPLOY_ORDER="$ordered_svc" + else + DEPLOY_ORDER="$DEPLOY_ORDER $ordered_svc" + fi fi done - local IFS=' ' + # Vault seed plan (S2.6, #928): one line per service whose + # tools/vault-seed-.sh ships. Services without a seeder are + # silently skipped — the real-run loop below mirrors this, + # making `--with woodpecker` in Step 3 auto-invoke + # tools/vault-seed-woodpecker.sh once that file lands without + # any further change to bin/disinto. + local seed_hdr_printed=false + local IFS=',' + for svc in $normalized_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + local seed_script="${FACTORY_ROOT}/tools/vault-seed-${svc}.sh" + if [ -x "$seed_script" ]; then + if [ "$seed_hdr_printed" = false ]; then + echo "── Vault seed dry-run ─────────────────────────────────" + seed_hdr_printed=true + fi + echo "[seed] [dry-run] ${seed_script} --dry-run" + fi + done + [ "$seed_hdr_printed" = true ] && echo "" + + echo "── Deploy services dry-run ────────────────────────────" + echo "[deploy] services to deploy: ${normalized_services}" echo "[deploy] deployment order: ${DEPLOY_ORDER}" for svc in $DEPLOY_ORDER; do - # Seed this service (if seed script exists) - local seed_name="$svc" + # Validate known services first case "$svc" in - woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; - agents) seed_name="agents" ;; - chat) seed_name="chat" ;; - edge) seed_name="ops-repo" ;; + forgejo|woodpecker-server|woodpecker-agent) ;; + *) + echo "Error: unknown service '${svc}' — known: forgejo, woodpecker-server, woodpecker-agent" >&2 + exit 1 + ;; esac - local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" - if [ -x "$seed_script" ]; then - echo "── Vault seed dry-run ─────────────────────────────────" - echo "[seed] [dry-run] ${seed_script} --dry-run" - echo "" - fi - - # Deploy this service - echo "── Deploy services dry-run ────────────────────────────" - echo "[deploy] services to deploy: ${with_services}" local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" if [ ! -f "$jobspec_path" ]; then echo "Error: jobspec not found: ${jobspec_path}" >&2 @@ -824,32 +847,6 @@ _disinto_init_nomad() { done echo "[deploy] dry-run complete" fi - - # Dry-run vault-runner (unconditionally, not gated by --with) - echo "" - echo "── Vault-runner dry-run ───────────────────────────────────" - local vault_runner_path="${FACTORY_ROOT}/nomad/jobs/vault-runner.hcl" - if [ -f "$vault_runner_path" ]; then - echo "[deploy] vault-runner: [dry-run] nomad job validate ${vault_runner_path}" - echo "[deploy] vault-runner: [dry-run] nomad job run -detach ${vault_runner_path}" - else - echo "[deploy] vault-runner: jobspec not found, skipping" - fi - - # Build custom images dry-run (if agents, chat, or edge services are included) - if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then - echo "" - echo "── Build images dry-run ──────────────────────────────" - if echo ",$with_services," | grep -q ",agents,"; then - echo "[build] [dry-run] docker build -t disinto/agents:local -f ${FACTORY_ROOT}/docker/agents/Dockerfile ${FACTORY_ROOT}" - fi - if echo ",$with_services," | grep -q ",chat,"; then - echo "[build] [dry-run] docker build -t disinto/chat:local -f ${FACTORY_ROOT}/docker/chat/Dockerfile ${FACTORY_ROOT}/docker/chat" - fi - if echo ",$with_services," | grep -q ",edge,"; then - echo "[build] [dry-run] docker build -t disinto/edge:local -f ${FACTORY_ROOT}/docker/edge/Dockerfile ${FACTORY_ROOT}/docker/edge" - fi - fi exit 0 fi @@ -937,127 +934,114 @@ _disinto_init_nomad() { echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" fi - # Build custom images required by Nomad jobs (S4.2, S5.2, S5.5) — before deploy. - # Single-node factory dev box: no multi-node pull needed, no registry auth. - # Can upgrade to approach B (registry push/pull) later if multi-node. - if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then - echo "" - echo "── Building custom images ─────────────────────────────" - if echo ",$with_services," | grep -q ",agents,"; then - local tag="disinto/agents:local" - echo "── Building $tag ─────────────────────────────" - docker build -t "$tag" -f "${FACTORY_ROOT}/docker/agents/Dockerfile" "${FACTORY_ROOT}" 2>&1 | tail -5 - fi - if echo ",$with_services," | grep -q ",chat,"; then - local tag="disinto/chat:local" - echo "── Building $tag ─────────────────────────────" - docker build -t "$tag" -f "${FACTORY_ROOT}/docker/chat/Dockerfile" "${FACTORY_ROOT}/docker/chat" 2>&1 | tail -5 - fi - if echo ",$with_services," | grep -q ",edge,"; then - local tag="disinto/edge:local" - echo "── Building $tag ─────────────────────────────" - docker build -t "$tag" -f "${FACTORY_ROOT}/docker/edge/Dockerfile" "${FACTORY_ROOT}/docker/edge" 2>&1 | tail -5 - fi - fi - - # Interleaved seed/deploy per service (S2.6, #928, #948). - # We interleave seed + deploy per service (not batch all seeds then all deploys) - # so that OAuth-dependent services can reach their dependencies during seeding. - # E.g., seed-forgejo → deploy-forgejo → seed-woodpecker (OAuth can now reach - # running forgejo) → deploy-woodpecker. + # Seed Vault for services that ship their own seeder (S2.6, #928). + # Convention: tools/vault-seed-.sh — auto-invoked when --with + # is requested. Runs AFTER vault-import so that real imported values + # win over generated seeds when both are present; each seeder is + # idempotent on a per-key basis (see vault-seed-forgejo.sh's + # "missing → generate, present → unchanged" contract), so re-running + # init does not rotate existing keys. Services without a seeder are + # silently skipped — keeps this loop forward-compatible with Step 3+ + # services that may ship their own seeder without touching bin/disinto. + # + # VAULT_ADDR is passed explicitly because cluster-up.sh writes the + # profile.d export *during* this same init run, so the current shell + # hasn't sourced it yet; sibling vault-* scripts (engines/policies/ + # auth/import) default VAULT_ADDR internally via _hvault_default_env, + # but vault-seed-forgejo.sh requires the caller to set it. + # + # The non-root branch invokes the seeder as `sudo -n -- env VAR=val + # script` rather than `sudo -n VAR=val -- script`: sudo treats bare + # `VAR=val` args as sudoers env-assignments, which the default + # `env_reset=on` policy silently discards unless the variable is in + # `env_keep` (VAULT_ADDR is not). Using `env` as the actual command + # sets VAULT_ADDR in the child process regardless of sudoers policy. if [ -n "$with_services" ]; then + # Normalize services: auto-include forgejo when woodpecker is requested + local normalized_services="$with_services" + if echo "$with_services" | grep -q "woodpecker" && ! echo "$with_services" | grep -q "forgejo"; then + normalized_services="forgejo,${with_services}" + fi + local vault_addr="${VAULT_ADDR:-http://127.0.0.1:8200}" - - # Build ordered deploy list (S3.4, S4.2, S5.2, S5.5): forgejo → woodpecker-server → woodpecker-agent → agents → staging → chat → edge - local DEPLOY_ORDER="" - for ordered_svc in forgejo woodpecker-server woodpecker-agent agents staging chat edge; do - if echo ",$with_services," | grep -q ",$ordered_svc,"; then - DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}" - fi - done - - local IFS=' ' - for svc in $DEPLOY_ORDER; do - # Seed this service (if seed script exists) - local seed_name="$svc" - case "$svc" in - woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; - agents) seed_name="agents" ;; - chat) seed_name="chat" ;; - edge) seed_name="ops-repo" ;; - esac - local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" + local IFS=',' + for svc in $normalized_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + local seed_script="${FACTORY_ROOT}/tools/vault-seed-${svc}.sh" if [ -x "$seed_script" ]; then echo "" - echo "── Seeding Vault for ${seed_name} ───────────────────────────" + echo "── Seeding Vault for ${svc} ───────────────────────────" if [ "$(id -u)" -eq 0 ]; then VAULT_ADDR="$vault_addr" "$seed_script" || exit $? else if ! command -v sudo >/dev/null 2>&1; then - echo "Error: vault-seed-${seed_name}.sh must run as root and sudo is not installed" >&2 + echo "Error: vault-seed-${svc}.sh must run as root and sudo is not installed" >&2 exit 1 fi sudo -n -- env "VAULT_ADDR=$vault_addr" "$seed_script" || exit $? fi fi + done + fi - # Deploy this service - echo "" - echo "── Deploying ${svc} ───────────────────────────────────────" + # Deploy services if requested + if [ -n "$with_services" ]; then + # Normalize services: auto-include forgejo when woodpecker is requested + # (woodpecker without forgejo is nonsensical) + local normalized_services="$with_services" + if echo "$with_services" | grep -q "woodpecker" && ! echo "$with_services" | grep -q "forgejo"; then + echo "Note: --with woodpecker implies --with forgejo (OAuth dependency)" + normalized_services="forgejo,${with_services}" + fi - # Seed host volumes before deployment (if needed) + # Define deployment order: forgejo -> woodpecker-server -> woodpecker-agent + # Only include services that are requested (after normalization) + local DEPLOY_ORDER="" + for ordered_svc in forgejo woodpecker-server woodpecker-agent; do + if echo ",$normalized_services," | grep -q ",$ordered_svc,"; then + if [ -z "$DEPLOY_ORDER" ]; then + DEPLOY_ORDER="$ordered_svc" + else + DEPLOY_ORDER="$DEPLOY_ORDER $ordered_svc" + fi + fi + done + + echo "" + echo "── Deploying services ─────────────────────────────────" + local -a deploy_cmd=("$deploy_sh") + # Split comma-separated service list into positional args (in deploy order) + local IFS=' ' + for svc in $DEPLOY_ORDER; do + if ! echo "$svc" | grep -qE '^[a-zA-Z0-9_-]+$'; then + echo "Error: invalid service name '${svc}' — must match ^[a-zA-Z0-9_-]+$" >&2 + exit 1 + fi + # Validate known services FIRST (before jobspec check) case "$svc" in - staging) - # Seed site-content host volume (/srv/disinto/docker) with static content - # The staging jobspec mounts this volume read-only to /srv/site - local site_content_src="${FACTORY_ROOT}/docker/index.html" - local site_content_dst="/srv/disinto/docker" - if [ -f "$site_content_src" ] && [ -d "$site_content_dst" ]; then - if ! cmp -s "$site_content_src" "${site_content_dst}/index.html" 2>/dev/null; then - echo "[staging] seeding site-content volume..." - cp "$site_content_src" "${site_content_dst}/index.html" - fi - fi + forgejo|woodpecker-server|woodpecker-agent) ;; + *) + echo "Error: unknown service '${svc}' — known: forgejo, woodpecker-server, woodpecker-agent" >&2 + exit 1 ;; esac - + # Check jobspec exists local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" if [ ! -f "$jobspec_path" ]; then echo "Error: jobspec not found: ${jobspec_path}" >&2 exit 1 fi - - local -a deploy_cmd=("$deploy_sh" "$svc") - if [ "$(id -u)" -eq 0 ]; then - "${deploy_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: deploy.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- "${deploy_cmd[@]}" || exit $? - fi + deploy_cmd+=("$svc") done - # Run vault-runner (unconditionally, not gated by --with) — infrastructure job - # vault-runner is always present since it's needed for vault action dispatch - echo "" - echo "── Running vault-runner ────────────────────────────────────" - local vault_runner_path="${FACTORY_ROOT}/nomad/jobs/vault-runner.hcl" - if [ -f "$vault_runner_path" ]; then - echo "[deploy] vault-runner: running Nomad job (infrastructure)" - local -a vault_runner_cmd=("$deploy_sh" "vault-runner") - if [ "$(id -u)" -eq 0 ]; then - "${vault_runner_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: deploy.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- "${vault_runner_cmd[@]}" || exit $? - fi + if [ "$(id -u)" -eq 0 ]; then + "${deploy_cmd[@]}" || exit $? else - echo "[deploy] vault-runner: jobspec not found, skipping" + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: deploy.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${deploy_cmd[@]}" || exit $? fi # Print final summary @@ -1074,25 +1058,16 @@ _disinto_init_nomad() { else echo "Imported: (none — seed kv/disinto/* manually before deploying secret-dependent services)" fi - echo "Deployed: ${with_services}" - if echo ",$with_services," | grep -q ",forgejo,"; then + echo "Deployed: ${normalized_services}" + if echo "$normalized_services" | grep -q "forgejo"; then echo "Ports: forgejo: 3000" fi - if echo ",$with_services," | grep -q ",woodpecker-server,"; then + if echo "$normalized_services" | grep -q "woodpecker-server"; then echo " woodpecker-server: 8000" fi - if echo ",$with_services," | grep -q ",woodpecker-agent,"; then + if echo "$normalized_services" | grep -q "woodpecker-agent"; then echo " woodpecker-agent: (agent connected)" fi - if echo ",$with_services," | grep -q ",agents,"; then - echo " agents: (polling loop running)" - fi - if echo ",$with_services," | grep -q ",staging,"; then - echo " staging: (internal, no external port)" - fi - if echo ",$with_services," | grep -q ",chat,"; then - echo " chat: 8080" - fi echo "────────────────────────────────────────────────────────" fi @@ -1178,70 +1153,6 @@ disinto_init() { exit 1 fi - # Normalize --with services (S3.4): expand 'woodpecker' shorthand to - # 'woodpecker-server,woodpecker-agent', auto-include forgejo when - # woodpecker is requested (OAuth dependency), and validate all names. - if [ -n "$with_services" ]; then - # Expand 'woodpecker' (bare) → 'woodpecker-server,woodpecker-agent'. - # Must not match already-expanded 'woodpecker-server'/'woodpecker-agent'. - local expanded="" - local IFS=',' - for _svc in $with_services; do - _svc=$(echo "$_svc" | xargs) - case "$_svc" in - woodpecker) _svc="woodpecker-server,woodpecker-agent" ;; - agents) _svc="agents" ;; - esac - expanded="${expanded:+${expanded},}${_svc}" - done - with_services="$expanded" - unset IFS - - # Auto-include forgejo when woodpecker is requested - if echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent," \ - && ! echo ",$with_services," | grep -q ",forgejo,"; then - echo "Note: --with woodpecker implies --with forgejo (OAuth dependency)" - with_services="forgejo,${with_services}" - fi - - # Auto-include forgejo and woodpecker when agents is requested - if echo ",$with_services," | grep -q ",agents,"; then - if ! echo ",$with_services," | grep -q ",forgejo,"; then - echo "Note: --with agents implies --with forgejo (agents need forge)" - with_services="forgejo,${with_services}" - fi - if ! echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent,"; then - echo "Note: --with agents implies --with woodpecker (agents need CI)" - with_services="${with_services},woodpecker-server,woodpecker-agent" - fi - fi - - # Auto-include all dependencies when edge is requested (S5.5) - if echo ",$with_services," | grep -q ",edge,"; then - # Edge depends on all backend services - for dep in forgejo woodpecker-server woodpecker-agent agents staging chat; do - if ! echo ",$with_services," | grep -q ",${dep},"; then - echo "Note: --with edge implies --with ${dep} (edge depends on all backend services)" - with_services="${with_services},${dep}" - fi - done - fi - - # Validate all service names are known - local IFS=',' - for _svc in $with_services; do - _svc=$(echo "$_svc" | xargs) - case "$_svc" in - forgejo|woodpecker-server|woodpecker-agent|agents|staging|chat|edge) ;; - *) - echo "Error: unknown service '${_svc}' — known: forgejo, woodpecker-server, woodpecker-agent, agents, staging, chat, edge" >&2 - exit 1 - ;; - esac - done - unset IFS - fi - # --import-* flag validation (S2.5). These three flags form an import # triple and must be consistent before dispatch: sops encryption is # useless without the age key to decrypt it, so either both --import-sops diff --git a/dev/AGENTS.md b/dev/AGENTS.md index c64551f..02fd612 100644 --- a/dev/AGENTS.md +++ b/dev/AGENTS.md @@ -1,4 +1,4 @@ - + # Dev Agent **Role**: Implement issues autonomously — write code, push branches, address diff --git a/docker-compose.yml b/docker-compose.yml index c4676f2..ba8c77c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,6 +15,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -77,6 +78,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -137,6 +139,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro diff --git a/docker/agents/Dockerfile b/docker/agents/Dockerfile index fa3b2d8..1bcba89 100644 --- a/docker/agents/Dockerfile +++ b/docker/agents/Dockerfile @@ -1,26 +1,21 @@ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ - bash curl git jq tmux nodejs npm python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ + bash curl git jq tmux python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ && pip3 install --break-system-packages networkx tomlkit \ && rm -rf /var/lib/apt/lists/* # Pre-built binaries (copied from docker/agents/bin/) # SOPS — encrypted data decryption tool -# Download sops binary (replaces manual COPY of vendored binary) -ARG SOPS_VERSION=3.9.4 -RUN curl -fsSL "https://github.com/getsops/sops/releases/download/v${SOPS_VERSION}/sops-v${SOPS_VERSION}.linux.amd64" \ - -o /usr/local/bin/sops && chmod +x /usr/local/bin/sops +COPY docker/agents/bin/sops /usr/local/bin/sops +RUN chmod +x /usr/local/bin/sops # tea CLI — official Gitea/Forgejo CLI for issue/label/comment operations -# Download tea binary (replaces manual COPY of vendored binary) -ARG TEA_VERSION=0.9.2 -RUN curl -fsSL "https://dl.gitea.com/tea/${TEA_VERSION}/tea-${TEA_VERSION}-linux-amd64" \ - -o /usr/local/bin/tea && chmod +x /usr/local/bin/tea +COPY docker/agents/bin/tea /usr/local/bin/tea +RUN chmod +x /usr/local/bin/tea -# Install Claude Code CLI — agent runtime for all LLM backends (llama, Claude API). -# The CLI is the execution environment; ANTHROPIC_BASE_URL selects the model provider. -RUN npm install -g @anthropic-ai/claude-code@2.1.84 +# Claude CLI is mounted from the host via docker-compose volume. +# No internet access to cli.anthropic.com required at build time. # Non-root user RUN useradd -m -u 1000 -s /bin/bash agent diff --git a/docker/chat/Dockerfile b/docker/chat/Dockerfile index c4cb28b..3d89863 100644 --- a/docker/chat/Dockerfile +++ b/docker/chat/Dockerfile @@ -1,22 +1,20 @@ # disinto-chat — minimal HTTP backend for Claude chat UI # -# Small Debian slim base with Python runtime and Node.js. +# Small Debian slim base with Python runtime. # Chosen for simplicity and small image size (~100MB). # # Image size: ~100MB (well under the 200MB ceiling) # -# Claude CLI is baked into the image — same pattern as the agents container. +# The claude binary is mounted from the host at runtime via docker-compose, +# not baked into the image — same pattern as the agents container. FROM debian:bookworm-slim -# Install Node.js (required for Claude CLI) and Python +# Install Python (no build-time network access needed) RUN apt-get update && apt-get install -y --no-install-recommends \ - nodejs npm python3 \ + python3 \ && rm -rf /var/lib/apt/lists/* -# Install Claude Code CLI — chat backend runtime -RUN npm install -g @anthropic-ai/claude-code@2.1.84 - # Non-root user — fixed UID 10001 for sandbox hardening (#706) RUN useradd -m -u 10001 -s /bin/bash chat diff --git a/docker/edge/dispatcher.sh b/docker/edge/dispatcher.sh index 282342a..a48abf2 100755 --- a/docker/edge/dispatcher.sh +++ b/docker/edge/dispatcher.sh @@ -560,168 +560,10 @@ _launch_runner_docker() { # _launch_runner_nomad ACTION_ID SECRETS_CSV MOUNTS_CSV # -# Dispatches a vault-runner batch job via `nomad job dispatch`. -# Polls `nomad job status` until terminal state (completed/failed). -# Reads exit code from allocation and writes .result.json. -# -# Usage: _launch_runner_nomad -# Returns: exit code of the nomad job (0=success, non-zero=failure) +# Nomad backend stub — will be implemented in migration Step 5. _launch_runner_nomad() { - local action_id="$1" - local secrets_csv="$2" - local mounts_csv="$3" - - log "Dispatching vault-runner batch job via Nomad for action: ${action_id}" - - # Dispatch the parameterized batch job - # The vault-runner job expects meta: action_id, secrets_csv - # Note: mounts_csv is not passed as meta (not declared in vault-runner.hcl) - local dispatch_output - dispatch_output=$(nomad job dispatch \ - -detach \ - -meta action_id="$action_id" \ - -meta secrets_csv="$secrets_csv" \ - vault-runner 2>&1) || { - log "ERROR: Failed to dispatch vault-runner job for ${action_id}" - log "Dispatch output: ${dispatch_output}" - write_result "$action_id" 1 "Nomad dispatch failed: ${dispatch_output}" - return 1 - } - - # Extract dispatched job ID from output (format: "vault-runner/dispatch--") - local dispatched_job_id - dispatched_job_id=$(echo "$dispatch_output" | grep -oP '(?<=Dispatched Job ID = ).+' || true) - - if [ -z "$dispatched_job_id" ]; then - log "ERROR: Could not extract dispatched job ID from nomad output" - log "Dispatch output: ${dispatch_output}" - write_result "$action_id" 1 "Could not extract dispatched job ID from nomad output" - return 1 - fi - - log "Dispatched vault-runner with job ID: ${dispatched_job_id}" - - # Poll job status until terminal state - # Batch jobs transition: running -> completed/failed - local max_wait=300 # 5 minutes max wait - local elapsed=0 - local poll_interval=5 - local alloc_id="" - - log "Polling nomad job status for ${dispatched_job_id}..." - - while [ "$elapsed" -lt "$max_wait" ]; do - # Get job status with JSON output for the dispatched child job - local job_status_json - job_status_json=$(nomad job status -json "$dispatched_job_id" 2>/dev/null) || { - log "ERROR: Failed to get job status for ${dispatched_job_id}" - write_result "$action_id" 1 "Failed to get job status for ${dispatched_job_id}" - return 1 - } - - # Check job status field (transitions to "dead" on completion) - local job_state - job_state=$(echo "$job_status_json" | jq -r '.Status // empty' 2>/dev/null) || job_state="" - - # Check allocation state directly - alloc_id=$(echo "$job_status_json" | jq -r '.Allocations[0]?.ID // empty' 2>/dev/null) || alloc_id="" - - if [ -n "$alloc_id" ]; then - local alloc_state - alloc_state=$(nomad alloc status -short "$alloc_id" 2>/dev/null || true) - - case "$alloc_state" in - *completed*|*success*|*dead*) - log "Allocation ${alloc_id} reached terminal state: ${alloc_state}" - break - ;; - *running*|*pending*|*starting*) - log "Allocation ${alloc_id} still running (state: ${alloc_state})..." - ;; - *failed*|*crashed*) - log "Allocation ${alloc_id} failed (state: ${alloc_state})" - break - ;; - esac - fi - - # Also check job-level state - case "$job_state" in - dead) - log "Job ${dispatched_job_id} reached terminal state: ${job_state}" - break - ;; - failed) - log "Job ${dispatched_job_id} failed" - break - ;; - esac - - sleep "$poll_interval" - elapsed=$((elapsed + poll_interval)) - done - - if [ "$elapsed" -ge "$max_wait" ]; then - log "ERROR: Timeout waiting for vault-runner job to complete" - write_result "$action_id" 1 "Timeout waiting for nomad job to complete" - return 1 - fi - - # Get final job status and exit code - local final_status_json - final_status_json=$(nomad job status -json "$dispatched_job_id" 2>/dev/null) || { - log "ERROR: Failed to get final job status" - write_result "$action_id" 1 "Failed to get final job status" - return 1 - } - - # Get allocation exit code - local exit_code=0 - local logs="" - - if [ -n "$alloc_id" ]; then - # Get allocation logs - logs=$(nomad alloc logs -short "$alloc_id" 2>/dev/null || true) - - # Try to get exit code from alloc status JSON - # Nomad alloc status -json has .TaskStates[""].Events[].ExitCode - local alloc_exit_code - alloc_exit_code=$(nomad alloc status -json "$alloc_id" 2>/dev/null | jq -r '.TaskStates["runner"].Events[-1].ExitCode // empty' 2>/dev/null) || alloc_exit_code="" - - if [ -n "$alloc_exit_code" ] && [ "$alloc_exit_code" != "null" ]; then - exit_code="$alloc_exit_code" - fi - fi - - # If we couldn't get exit code from alloc, check job state as fallback - # Note: "dead" = terminal state for batch jobs (includes successful completion) - # Only "failed" indicates actual failure - if [ "$exit_code" -eq 0 ]; then - local final_state - final_state=$(echo "$final_status_json" | jq -r '.Status // empty' 2>/dev/null) || final_state="" - - case "$final_state" in - failed) - exit_code=1 - ;; - esac - fi - - # Truncate logs if too long - if [ ${#logs} -gt 1000 ]; then - logs="${logs: -1000}" - fi - - # Write result file - write_result "$action_id" "$exit_code" "$logs" - - if [ "$exit_code" -eq 0 ]; then - log "Vault-runner job completed successfully for action: ${action_id}" - else - log "Vault-runner job failed for action: ${action_id} (exit code: ${exit_code})" - fi - - return "$exit_code" + echo "nomad backend not yet implemented" >&2 + return 1 } # Launch runner for the given action (backend-agnostic orchestrator) @@ -1209,8 +1051,11 @@ main() { # Validate backend selection at startup case "$DISPATCHER_BACKEND" in - docker|nomad) - log "Using ${DISPATCHER_BACKEND} backend for vault-runner dispatch" + docker) ;; + nomad) + log "ERROR: nomad backend not yet implemented" + echo "nomad backend not yet implemented" >&2 + exit 1 ;; *) log "ERROR: unknown DISPATCHER_BACKEND=${DISPATCHER_BACKEND}" diff --git a/docker/edge/entrypoint-edge.sh b/docker/edge/entrypoint-edge.sh index 83131fb..1b5f94f 100755 --- a/docker/edge/entrypoint-edge.sh +++ b/docker/edge/entrypoint-edge.sh @@ -173,15 +173,11 @@ PROJECT_TOML="${PROJECT_TOML:-projects/disinto.toml}" sleep 1200 # 20 minutes done) & -# ── Load optional secrets from secrets/*.enc (#777) ──────────────────── -# Engagement collection (collect-engagement.sh) requires CADDY_ secrets to -# SCP access logs from a remote edge host. When age key or secrets dir is -# missing, or any secret fails to decrypt, log a warning and skip the cron. -# Caddy itself does not depend on these secrets. +# ── Load required secrets from secrets/*.enc (#777) ──────────────────── +# Edge container declares its required secrets; missing ones cause a hard fail. _AGE_KEY_FILE="${HOME}/.config/sops/age/keys.txt" _SECRETS_DIR="/opt/disinto/secrets" EDGE_REQUIRED_SECRETS="CADDY_SSH_KEY CADDY_SSH_HOST CADDY_SSH_USER CADDY_ACCESS_LOG" -EDGE_ENGAGEMENT_READY=0 # Assume not ready until proven otherwise _edge_decrypt_secret() { local enc_path="${_SECRETS_DIR}/${1}.enc" @@ -196,60 +192,47 @@ if [ -f "$_AGE_KEY_FILE" ] && [ -d "$_SECRETS_DIR" ]; then export "$_secret_name=$_val" done if [ -n "$_missing" ]; then - echo "WARN: required engagement secrets missing from secrets/*.enc:${_missing}" >&2 - echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 - EDGE_ENGAGEMENT_READY=0 - else - echo "edge: loaded required engagement secrets: ${EDGE_REQUIRED_SECRETS}" >&2 - EDGE_ENGAGEMENT_READY=1 + echo "FATAL: required secrets missing from secrets/*.enc:${_missing}" >&2 + echo " Run 'disinto secrets add ' for each missing secret." >&2 + echo " If migrating from .env.vault.enc, run 'disinto secrets migrate-from-vault' first." >&2 + exit 1 fi + echo "edge: loaded required secrets: ${EDGE_REQUIRED_SECRETS}" >&2 else - echo "WARN: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — engagement secrets unavailable" >&2 - echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 - EDGE_ENGAGEMENT_READY=0 + echo "FATAL: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — cannot load required secrets" >&2 + echo " Ensure age is installed and secrets/*.enc files are present." >&2 + exit 1 fi # Start daily engagement collection cron loop in background (#745) # Runs collect-engagement.sh daily at ~23:50 UTC via a sleep loop that # calculates seconds until the next 23:50 window. SSH key from secrets/*.enc (#777). -# Guarded: only start if EDGE_ENGAGEMENT_READY=1. -if [ "$EDGE_ENGAGEMENT_READY" -eq 1 ]; then - (while true; do - # Calculate seconds until next 23:50 UTC - _now=$(date -u +%s) - _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) - if [ "$_target" -le "$_now" ]; then - _target=$(( _target + 86400 )) - fi - _sleep_secs=$(( _target - _now )) - echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 - sleep "$_sleep_secs" - _fetch_log="/tmp/caddy-access-log-fetch.log" - _ssh_key_file=$(mktemp) - printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" - chmod 0600 "$_ssh_key_file" - scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ - "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ - "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true - rm -f "$_ssh_key_file" - if [ -s "$_fetch_log" ]; then - CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ - | tee -a /opt/disinto-logs/collect-engagement.log || true - else - echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 - fi - rm -f "$_fetch_log" - done) & -else - echo "edge: collect-engagement cron skipped (EDGE_ENGAGEMENT_READY=0)" >&2 -fi - -# Nomad template renders Caddyfile to /local/Caddyfile via service discovery; -# copy it into the expected location if present (compose uses the mounted path). -if [ -f /local/Caddyfile ]; then - cp /local/Caddyfile /etc/caddy/Caddyfile - echo "edge: using Nomad-rendered Caddyfile from /local/Caddyfile" >&2 -fi +(while true; do + # Calculate seconds until next 23:50 UTC + _now=$(date -u +%s) + _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) + if [ "$_target" -le "$_now" ]; then + _target=$(( _target + 86400 )) + fi + _sleep_secs=$(( _target - _now )) + echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 + sleep "$_sleep_secs" + _fetch_log="/tmp/caddy-access-log-fetch.log" + _ssh_key_file=$(mktemp) + printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" + chmod 0600 "$_ssh_key_file" + scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ + "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ + "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true + rm -f "$_ssh_key_file" + if [ -s "$_fetch_log" ]; then + CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ + | tee -a /opt/disinto-logs/collect-engagement.log || true + else + echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 + fi + rm -f "$_fetch_log" +done) & # Caddy as main process — run in foreground via wait so background jobs survive # (exec replaces the shell, which can orphan backgrounded subshells) diff --git a/gardener/AGENTS.md b/gardener/AGENTS.md index 5dcd12f..e9ad846 100644 --- a/gardener/AGENTS.md +++ b/gardener/AGENTS.md @@ -1,4 +1,4 @@ - + # Gardener Agent **Role**: Backlog grooming — detect duplicate issues, missing acceptance diff --git a/gardener/dust.jsonl b/gardener/dust.jsonl index e69de29..14b0d5c 100644 --- a/gardener/dust.jsonl +++ b/gardener/dust.jsonl @@ -0,0 +1 @@ +{"issue":915,"group":"lib/generators.sh","title":"remove no-op sed in generate_compose --build mode","reason":"sed replaces agents: with itself — no behavior change; single-line removal","ts":"2026-04-17T01:04:05Z"} diff --git a/gardener/pending-actions.json b/gardener/pending-actions.json index 1dbf2a3..1c89c7d 100644 --- a/gardener/pending-actions.json +++ b/gardener/pending-actions.json @@ -1,52 +1,37 @@ [ { "action": "edit_body", - "issue": 1025, - "body": "## Prior art: PR #1033 (open, branch `fix/issue-1025` retained)\n\nFirst attempt by dev-qwen2 (head `f692dd2`). Test script (`tests/smoke-edge-subpath.sh`, 13.8 KB) and pipeline (`.woodpecker/edge-subpath.yml`) both landed and look reasonable, but the **CI harness design is wrong**: the pipeline boots a bare `alpine:3.19` container and runs the smoke script directly against `BASE_URL=http://localhost`, with no stack to test against.\n\n**This is a harness design gap, not a script bug.** The smoke script itself is a reasonable post-deploy tool — the mistake was trying to exercise it as a hermetic CI step.\n\n**Approach (Option 1 — split the work):**\n\nKeep `tests/smoke-edge-subpath.sh` as an out-of-CI post-deploy tool (accepts `BASE_URL` env var). Replace the CI pipeline step that tries to curl a live stack with static checks only: `shellcheck`, `caddy validate` on the generated Caddyfile, and a template-substitution unit test that verifies routing block shape.\n\nBranch `fix/issue-1025` is preserved at `f692dd2` — the smoke script body is reusable; only the pipeline harness needs a rethink.\n\n**Timeline:**\n- 2026-04-19 09:14 — dev-qwen2 last pushed `f692dd2`\n- 3 pipelines (#1378/#1380/#1382) all fail: no service to curl (connection refused)\n\n## Acceptance criteria\n- [ ] `.woodpecker/edge-subpath.yml` pipeline runs `shellcheck` on `tests/smoke-edge-subpath.sh` with no live service curl\n- [ ] `caddy validate` runs on the generated Caddyfile in CI (template-substitution unit test)\n- [ ] A template-substitution test verifies the Caddyfile routing block shape (forge/ci/staging/chat paths)\n- [ ] `tests/smoke-edge-subpath.sh` accepts `BASE_URL` env var for post-deploy staging runs\n- [ ] CI green (no connection-refused failures on Woodpecker)\n\n## Affected files\n- `.woodpecker/edge-subpath.yml` — pipeline config (static checks only, no service curl)\n- `tests/smoke-edge-subpath.sh` — out-of-CI smoke script (reusable from PR #1033)\n\n## Dependencies\n- #1038 should land first to unblock local edge staging runs (optional — CI fix is independent)" - }, - { - "action": "remove_label", - "issue": 1025, - "label": "blocked" + "issue": 910, + "body": "Flagged by AI reviewer in PR #909.\n\n## Problem\n\n`tools/vault-import.sh` still uses hardcoded `secret/data/${path}` for its curl-based KV write (lines 149, 151, 162, 166, 170). The rest of the codebase was migrated to the configurable `VAULT_KV_MOUNT` variable (defaulting to `kv`) via PR #909. Any deployment with `kv/` as its KV mount will see 403/404 failures when `vault-import.sh` runs.\n\n## Fix\n\nEither:\n1. Refactor the write in `vault-import.sh` to call `hvault_kv_put` (which now respects `VAULT_KV_MOUNT`), or\n2. Replace the hardcoded `secret/data` reference with `${VAULT_KV_MOUNT:-kv}/data` matching the convention in `lib/hvault.sh`.\n\n---\n*Auto-created from AI review*\n\n## Affected files\n\n- `tools/vault-import.sh` (lines 149, 151, 162, 166, 170 — hardcoded `secret/data` references)\n- `lib/hvault.sh` (reference implementation using `VAULT_KV_MOUNT`)\n\n## Acceptance criteria\n\n- [ ] `tools/vault-import.sh` uses `${VAULT_KV_MOUNT:-kv}/data` (or calls `hvault_kv_put`) instead of hardcoded `secret/data`\n- [ ] No hardcoded `secret/data` path references remain in `tools/vault-import.sh`\n- [ ] Vault KV writes succeed when `VAULT_KV_MOUNT=kv` is set (matching the standard deployment config)\n- [ ] `shellcheck` clean\n" }, { "action": "add_label", - "issue": 1025, + "issue": 910, "label": "backlog" }, { "action": "edit_body", - "issue": 1038, - "body": "## Problem\n\n`disinto-edge` crashloops on any deployment that has not opted into the age-encrypted secret store (#777), because the edge entrypoint treats four secrets as unconditionally required:\n\n```\nFATAL: age key (/home/agent/.config/sops/age/keys.txt) or secrets dir (/opt/disinto/secrets) not found — cannot load required secrets\n```\n\nObserved on `disinto-dev-box` (container `disinto-edge`, restarting every ~30s), which blocks PR #1033 (edge-subpath smoke test) and any other work that depends on a running edge.\n\n## Root cause\n\n`docker/edge/entrypoint-edge.sh:176-205` requires:\n\n- `~/.config/sops/age/keys.txt`\n- `/opt/disinto/secrets/` with `.enc` files for `CADDY_SSH_KEY`, `CADDY_SSH_HOST`, `CADDY_SSH_USER`, `CADDY_ACCESS_LOG`.\n\nThese four secrets feed exactly one feature: the daily 23:50 UTC `collect-engagement.sh` cron (#745), which SCPs Caddy access logs from a **remote production edge host** for engagement parsing. On a local factory box or any deployment that has not set up a remote edge, this code path has no target — yet its absence kills the whole edge container.\n\n## Fix\n\nMake the secrets block **optional**. When age key or secrets dir is missing, or any of the four CADDY_ secrets fail to decrypt, log a warning and skip the `collect-engagement` cron loop. Caddy itself does not depend on these secrets and should start normally.\n\nThe concrete edit is around lines 176-205 of `docker/edge/entrypoint-edge.sh` — guard the secret-loading block with a check for the age key and secrets dir, set `EDGE_ENGAGEMENT_READY=0` on failure, and skip cron registration when `EDGE_ENGAGEMENT_READY != 1`.\n\n## Acceptance criteria\n- [ ] `docker/edge/entrypoint-edge.sh` loads CADDY_ secrets optionally — missing age key or secrets dir logs a warning and continues, does not FATAL\n- [ ] Caddy starts normally when CADDY_ secrets are absent\n- [ ] `collect-engagement` cron is skipped (not registered) when engagement secrets are unavailable\n- [ ] On deployments WITH secrets configured, behavior is unchanged (collect-engagement cron still fires at 23:50 UTC)\n- [ ] CI green\n\n## Affected files\n- `docker/edge/entrypoint-edge.sh` — lines 176-205, secrets loading block made optional" - }, - { - "action": "remove_label", - "issue": 1038, - "label": "blocked" + "issue": 914, + "body": "Flagged by AI reviewer in PR #911.\n\n## Problem\n\n`lib/generators.sh` fixes the `agents` service missing `pull_policy: build` in `--build` mode (PR #893), but the `edge` service has the same root cause: the sed replacement at line 664 produces `build: ./docker/edge` with no `pull_policy: build`. Without it, `docker compose up -d --force-recreate` reuses the cached edge image and silently keeps running stale code even after source changes.\n\n## Fix\n\nAdd `\\n pull_policy: build` to the edge sed replacement, matching the pattern applied to agents in PR #893.\n\n---\n*Auto-created from AI review*\n\n## Affected files\n\n- `lib/generators.sh` (line 664 — edge service sed replacement missing `pull_policy: build`)\n\n## Acceptance criteria\n\n- [ ] `lib/generators.sh` edge service block emits `pull_policy: build` when `--build` mode is active (matching the pattern from PR #893 for the agents service)\n- [ ] `docker compose up -d --force-recreate` after source changes rebuilds the edge image rather than using the cached layer\n- [ ] Generated `docker-compose.yml` edge service stanza contains `pull_policy: build`\n- [ ] `shellcheck` clean\n" }, { "action": "add_label", - "issue": 1038, + "issue": 914, "label": "backlog" }, { "action": "edit_body", - "issue": 850, - "body": "## Problem\n\nWhen the compose generator emits the same service name twice — e.g. both the legacy `ENABLE_LLAMA_AGENT=1` branch and a matching `[agents.llama]` TOML block produce an `agents-llama:` key — the failure is deferred all the way to `docker compose` YAML parsing:\n\n```\nfailed to parse /home/johba/disinto/docker-compose.yml: yaml: construct errors:\n line 4: line 431: mapping key \"agents-llama\" already defined at line 155\n```\n\nBy then, the user has already paid the cost of: pre-build binary downloads, generator run, Caddyfile regeneration. The only hint about what went wrong is a line number in a generated file. Root cause (dual activation) is not surfaced.\n\n## Fix\n\nAdd a generate-time guard to `lib/generators.sh`:\n\n- After collecting all service blocks to emit, compare the set of service names against duplicates.\n- If a duplicate is detected, abort with a clear message naming both sources of truth (e.g. `\"agents-llama\" emitted twice — from ENABLE_LLAMA_AGENT=1 and from [agents.llama] in projects/disinto.toml; remove one`).\n\n## Prior art: PR #872 (closed, branch `fix/issue-850` retained)\n\ndev-qwen's first attempt (`db009e3`) landed the dup-detection logic in `lib/generators.sh` correctly (unit test `tests/test-duplicate-service-detection.sh` passes all 3 cases), but the smoke test fails on CI.\n\n**Why the smoke test fails:** sections 1-7 of `smoke-init.sh` already run `bin/disinto init`, materializing `docker-compose.yml`. Section 8 re-invokes `bin/disinto init` to verify the dup guard fires — but `_generate_compose_impl` early-returns with `\"Compose: already exists, skipping\"` before reaching the dup-check.\n\n**Suggested fix:** in `tests/smoke-init.sh` section 8 (around line 452, before the second `bin/disinto init` invocation), add:\n\n```bash\nrm -f \"${FACTORY_ROOT}/docker-compose.yml\"\n```\n\nso the generator actually runs and the dup-detection path is exercised. Do **not** hoist the dup-check above the early-return.\n\nThe branch `fix/issue-850` is preserved as a starting point — pick up from `db009e3` and patch the smoke-test cleanup.\n\nRelated: #846.\n\n## Acceptance criteria\n- [ ] `bin/disinto init` with a config that would produce duplicate service names aborts with a clear error message naming both sources (e.g. `ENABLE_LLAMA_AGENT=1` and `[agents.llama]` TOML block)\n- [ ] `tests/smoke-init.sh` section 8 removes `docker-compose.yml` before re-invoking `disinto init` so the dup guard is exercised\n- [ ] Unit test `tests/test-duplicate-service-detection.sh` passes all 3 cases\n- [ ] CI green (smoke-init.sh section 8 no longer skips dup detection)\n\n## Affected files\n- `lib/generators.sh` — duplicate service name check after collecting all service blocks\n- `tests/smoke-init.sh` — section 8: add `rm -f \\${FACTORY_ROOT}/docker-compose.yml` before second `disinto init`" - }, - { - "action": "remove_label", - "issue": 850, - "label": "blocked" + "issue": 867, + "body": "## Incident\n\n**2026-04-16 ~10:55–11:52 UTC.** Woodpecker CI agent (`disinto-woodpecker-agent`) entered a repeated gRPC-error crashloop (Codeberg #813 class — gRPC-in-nested-docker). Every workflow it accepted exited 1 within seconds, never actually running pipeline steps.\n\n**Blast radius:** dev-qwen took issue #842 at 10:55, opened PR #859, and burned its full 3-attempt `pr-lifecycle` CI-fix budget between 10:55 and 11:08 reacting to these infra-flake \"CI failures.\" Each failure arrived in ~30–60 seconds, too fast to be a real test run. After exhausting the budget, dev-qwen marked #842 as `blocked: ci_exhausted` and moved on. No real bug was being detected; the real failure surfaced later only after an operator restarted the WP agent and manually retriggered pipeline #966 — which then returned a legitimate `bats-init-nomad` failure in test #6 (different issue).\n\n**Root cause of the infra-flake:** gRPC-in-nested-docker bug, Woodpecker server ↔ agent comms inside nested containers. Known-flaky; restart of `disinto-woodpecker-agent` clears it.\n\n**Recovery:** operator `docker restart disinto-woodpecker-agent` + retrigger pipelines via WP API POST `/api/repos/2/pipelines/`. Fresh run reached real stage signal.\n\n## Why this burned dev-qwen's budget\n\n`pr-lifecycle`'s CI-fix budget treats every failed commit-status as a signal to invoke the agent. It has no notion of \"infra flake\" vs. \"real test failure\" and no heuristic to distinguish them. Four infra-flake failures in 13 minutes looked identical to four real code-bug failures.\n\n## Suggestions — what supervisor can check every 20min\n\nSupervisor runs every `1200s` already. Add these probes:\n\n**1. WP agent container health.**\n```\ndocker inspect disinto-woodpecker-agent --format '{{.State.Health.Status}}'\n```\nIf `unhealthy` for the second consecutive supervisor tick → **restart it automatically + post a comment on any currently-running dev-bot/dev-qwen issues warning \"CI agent was restarted; subsequent failures before this marker may be infra-flake.\"**\n\n**2. Fast-failure heuristic on WP pipelines.**\nQuery WP API `GET /api/repos/2/pipelines?page=1`. For each pipeline in state `failure`, compute `finished - started`. If duration < 60s, flag as probable infra-flake. Three flagged flakes within a 15-min window → trigger agent restart as in (1) and a bulk-retrigger via POST `/api/repos/2/pipelines/` for each.\n\n**3. grpc error pattern in agent log.**\n`docker logs --since 20m disinto-woodpecker-agent 2>&1 | grep -c 'grpc error'` — if ≥3 matches, agent is probably wedged. Trigger restart as in (1).\n\n**4. Issue-level guard.**\nWhen supervisor detects an agent restart, scan for issues updated in the preceding 30min with label `blocked: ci_exhausted` and for each one:\n- unassign + remove `blocked` label (return to pool)\n- comment on the issue: *\"CI agent was unhealthy between HH:MM and HH:MM — prior 3/3 retry budget may have been spent on infra flake, not real failures. Re-queueing for a fresh attempt.\"*\n- retrigger the PR's latest WP pipeline\n\nThis last step is the key correction: **`ci_exhausted` preceded by WP-agent-unhealth = false positive; return to pool with context.**\n\n## Why this matters for the migration\n\nBetween now and cutover every WP CI flake that silently exhausts an agent's budget steals hours of clock time. Without an automatic recovery path, the pace of the step-N backlogs falls off a cliff the moment the agent next goes unhealthy — and it *will* go unhealthy again (Codeberg #813 is not fixed upstream yet).\n\n## Fix for this specific incident (already applied manually)\n\n- Restarted `disinto-woodpecker-agent`.\n- Closed PR #859 (kept branch `fix/issue-842` at `64080232`).\n- Unassigned dev-qwen from #842, removed `blocked` label, appended prior-art section + pipeline #966 test-#6 failure details to issue body so the next claimant starts with full context.\n\n## Non-goals\n\n- Not trying to fix Codeberg #813 itself (upstream gRPC-in-nested-docker issue).\n- Not trying to fix `pr-lifecycle`'s budget logic — the supervisor-side detection is cheaper and more robust than per-issue budget changes.\n\n## Labels / meta\n\n- `bug-report` + supervisor-focused. Classify severity as blocker for the migration cadence (not for factory day-to-day — it only bites when an unfixable-by-dev issue hits the budget).\n\n## Affected files\n\n- `supervisor/supervisor-run.sh` — add WP agent health probes and flake-detection logic\n- `supervisor/preflight.sh` — may need additional data collection for WP agent health status\n\n## Acceptance criteria\n\n- [ ] Supervisor detects an unhealthy `disinto-woodpecker-agent` container (via `docker inspect` health status or gRPC error log count ≥ 3) and automatically restarts it\n- [ ] After an auto-restart, supervisor scans for issues updated in the prior 30 min labeled `blocked: ci_exhausted` and returns them to the pool (unassign, remove `blocked`, add comment noting infra-flake window)\n- [ ] Fast-failure heuristic: pipelines completing in <60s are flagged as probable infra-flake; 3+ in a 15-min window triggers the restart+retrigger flow\n- [ ] Already-swept PRs/issues are not processed twice (idempotency guard via `` comment)\n- [ ] CI green\n" }, { "action": "add_label", - "issue": 850, + "issue": 867, "label": "backlog" }, { - "action": "comment", - "issue": 758, - "body": "This issue is the critical path blocker for #820 (ops repo re-seed) and #982 (collect-engagement commit fix). Both are in the backlog and ready to merge, but cannot run until ops repo branch protection is resolved. Needs admin/human action to change Forgejo branch protection settings on disinto-ops — no code change can unblock this." + "action": "add_label", + "issue": 820, + "label": "backlog" } ] diff --git a/lib/AGENTS.md b/lib/AGENTS.md index b54f5cb..97e6f5e 100644 --- a/lib/AGENTS.md +++ b/lib/AGENTS.md @@ -1,4 +1,4 @@ - + # Shared Helpers (`lib/`) All agents source `lib/env.sh` as their first action. Additional helpers are @@ -30,9 +30,9 @@ sourced as needed. | `lib/git-creds.sh` | Shared git credential helper configuration. `configure_git_creds([HOME_DIR] [RUN_AS_CMD])` — writes a static credential helper script and configures git globally to use password-based HTTP auth (Forgejo 11.x rejects API tokens for `git push`, #361). **Retry on cold boot (#741)**: resolves bot username from `FORGE_TOKEN` with 5 retries (exponential backoff 1-5s); fails loudly and returns 1 if Forgejo is unreachable — never falls back to a wrong hardcoded default (exports `BOT_USER` on success). `repair_baked_cred_urls([--as RUN_AS_CMD] DIR ...)` — rewrites any git remote URLs that have credentials baked in to use clean URLs instead; uses `safe.directory` bypass for root-owned repos (#671). Requires `FORGE_PASS`, `FORGE_URL`, `FORGE_TOKEN`. | entrypoints (agents, edge) | | `lib/ops-setup.sh` | `setup_ops_repo()` — creates ops repo on Forgejo if it doesn't exist, configures bot collaborators, clones/initializes ops repo locally, seeds directory structure (vault, knowledge, evidence, sprints). Evidence subdirectories seeded: engagement/, red-team/, holdout/, evolution/, user-test/. Also seeds sprints/ for architect output. Exports `_ACTUAL_OPS_SLUG`. `migrate_ops_repo(ops_root, [primary_branch])` — idempotent migration helper that seeds missing directories and .gitkeep files on existing ops repos (pre-#407 deployments). | bin/disinto (init) | | `lib/ci-setup.sh` | `_install_cron_impl()` — installs crontab entries for bare-metal deployments (compose mode uses polling loop instead). `_create_forgejo_oauth_app()` — generic helper to create an OAuth2 app on Forgejo (shared by Woodpecker and chat). `_create_woodpecker_oauth_impl()` — creates Woodpecker OAuth2 app (thin wrapper). `_create_chat_oauth_impl()` — creates disinto-chat OAuth2 app, writes `CHAT_OAUTH_CLIENT_ID`/`CHAT_OAUTH_CLIENT_SECRET` to `.env` (#708). `_generate_woodpecker_token_impl()` — auto-generates WOODPECKER_TOKEN via OAuth2 flow. `_activate_woodpecker_repo_impl()` — activates repo in Woodpecker. All gated by `_load_ci_context()` which validates required env vars. | bin/disinto (init) | -| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (uses `codeberg.org/forgejo/forgejo:11.0` tag; `CLAUDE_BIN_DIR` volume mount removed from agents/llama services — only `reproduce` and `edge` still use the host-mounted CLI (#992); adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility; Forgejo includes a healthcheck so dependent services use `condition: service_healthy` — fixes cold-start races, #665; adds `chat` service block with isolated `chat-config` named volume and `CHAT_HISTORY_DIR` bind-mount for per-user NDJSON history persistence (#710); injects `FORWARD_AUTH_SECRET` for Caddy↔chat defense-in-depth auth (#709); cost-cap env vars `CHAT_MAX_REQUESTS_PER_HOUR`, `CHAT_MAX_REQUESTS_PER_DAY`, `CHAT_MAX_TOKENS_PER_DAY` (#711); subdomain fallback comment for `EDGE_TUNNEL_FQDN_*` vars (#713); all `depends_on` now use `condition: service_healthy/started` instead of bare service names; all services now include `restart: unless-stopped` including the edge service — #768; agents service now uses `image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}` instead of `build:` (#429); `WOODPECKER_PLUGINS_PRIVILEGED` env var added to woodpecker service (#779); agents-llama conditional block gated on `ENABLE_LLAMA_AGENT=1` (#769); `agents-llama-all` compose service (profile `agents-llama-all`, all 7 roles: review,dev,gardener,architect,planner,predictor,supervisor) added by #801; agents service gains volume mounts for `./projects`, `./.env`, `./state`), `generate_caddyfile()` — Caddyfile (routes: `/forge/*` → forgejo:3000, `/woodpecker/*` → woodpecker:8000, `/staging/*` → staging:80; `/chat/login` and `/chat/oauth/callback` bypass `forward_auth` so unauthenticated users can reach the OAuth flow; `/chat/*` gated by `forward_auth` on `chat:8080/chat/auth/verify` which stamps `X-Forwarded-User` (#709); root `/` redirects to `/forge/`), `generate_staging_index()` — staging index, `generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) | +| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (uses `codeberg.org/forgejo/forgejo:11.0` tag; adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility; Forgejo includes a healthcheck so dependent services use `condition: service_healthy` — fixes cold-start races, #665; adds `chat` service block with isolated `chat-config` named volume and `CHAT_HISTORY_DIR` bind-mount for per-user NDJSON history persistence (#710); injects `FORWARD_AUTH_SECRET` for Caddy↔chat defense-in-depth auth (#709); cost-cap env vars `CHAT_MAX_REQUESTS_PER_HOUR`, `CHAT_MAX_REQUESTS_PER_DAY`, `CHAT_MAX_TOKENS_PER_DAY` (#711); subdomain fallback comment for `EDGE_TUNNEL_FQDN_*` vars (#713); all `depends_on` now use `condition: service_healthy/started` instead of bare service names; all services now include `restart: unless-stopped` including the edge service — #768; agents service now uses `image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}` instead of `build:` (#429); `WOODPECKER_PLUGINS_PRIVILEGED` env var added to woodpecker service (#779); agents-llama conditional block gated on `ENABLE_LLAMA_AGENT=1` (#769); `agents-llama-all` compose service (profile `agents-llama-all`, all 7 roles: review,dev,gardener,architect,planner,predictor,supervisor) added by #801; agents service gains volume mounts for `./projects`, `./.env`, `./state`), `generate_caddyfile()` — Caddyfile (routes: `/forge/*` → forgejo:3000, `/woodpecker/*` → woodpecker:8000, `/staging/*` → staging:80; `/chat/login` and `/chat/oauth/callback` bypass `forward_auth` so unauthenticated users can reach the OAuth flow; `/chat/*` gated by `forward_auth` on `chat:8080/chat/auth/verify` which stamps `X-Forwarded-User` (#709); root `/` redirects to `/forge/`), `generate_staging_index()` — staging index, `generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) | | `lib/sprint-filer.sh` | Post-merge sub-issue filer for sprint PRs. Invoked by the `.woodpecker/ops-filer.yml` pipeline after a sprint PR merges to ops repo `main`. Parses ` ... ` blocks from sprint PR bodies to extract sub-issue definitions, creates them on the project repo using `FORGE_FILER_TOKEN` (narrow-scope `filer-bot` identity with `issues:write` only), adds `in-progress` label to the parent vision issue, and handles vision lifecycle closure when all sub-issues are closed. Uses `filer_api_all()` for paginated fetches. Idempotent: uses `` markers to skip already-filed issues. Requires `FORGE_FILER_TOKEN`, `FORGE_API`, `FORGE_API_BASE`, `FORGE_OPS_REPO`. | `.woodpecker/ops-filer.yml` (CI pipeline on ops repo) | | `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) | | `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) | -| `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_get_or_empty(PATH)` — GET /v1/PATH; 200→raw body, 404→empty, else structured error + return 1 (used by sync scripts to distinguish "absent, create" from hard failure without tripping errexit, #881). `hvault_ensure_kv_v2(MOUNT, [LOG_PREFIX])` — idempotent KV v2 mount assertion: enables mount if absent, fails loudly if present as wrong type/version. Extracted from all `vault-seed-*.sh` scripts to eliminate dup-detector violations. Respects `DRY_RUN=1`. `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. `_hvault_seed_key(PATH, KEY, [GENERATOR])` — seed one KV key if absent; reads existing data and merges to preserve sibling keys (KV v2 replaces atomically); returns 0=created, 1=unchanged, 2=API error (#992). All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | `tools/vault-apply-policies.sh`, `tools/vault-apply-roles.sh`, `lib/init/nomad/vault-nomad-auth.sh`, `tools/vault-seed-*.sh` | -| `lib/init/nomad/` | Nomad+Vault installer scripts. `cluster-up.sh` — idempotent Step-0 orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling; `HOST_VOLUME_DIRS` array now includes `/srv/disinto/docker` (for staging file-server, S5.2, #989, #992). `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. `vault-nomad-auth.sh` — Step-2 script that enables Vault's JWT auth at path `jwt-nomad`, writes the JWKS/algs config pointing at Nomad's workload-identity signer, delegates role sync to `tools/vault-apply-roles.sh`, installs `/etc/nomad.d/server.hcl`, and SIGHUPs `nomad.service` if the file changed (#881). `wp-oauth-register.sh` — S3.3 script that creates the Woodpecker OAuth2 app in Forgejo and stores `forgejo_client`/`forgejo_secret` in Vault KV v2 at `kv/disinto/shared/woodpecker`; idempotent (skips if app or secrets already present); called by `bin/disinto --with woodpecker`. `deploy.sh` — S4 dependency-ordered Nomad job deploy + health-wait; takes a list of jobspec basenames, submits each to Nomad and polls until healthy before proceeding to the next; supports `--dry-run` and per-job timeout overrides via `JOB_READY_TIMEOUT_`; global default timeout `JOB_READY_TIMEOUT_SECS` is 360s (raised from 240s for chat cold-start, #1036); invoked by `bin/disinto --with ` and `cluster-up.sh`; deploy order now covers staging, chat, edge (S5.5, #992). Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` | +| `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_get_or_empty(PATH)` — GET /v1/PATH; 200→raw body, 404→empty, else structured error + return 1 (used by sync scripts to distinguish "absent, create" from hard failure without tripping errexit, #881). `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | `tools/vault-apply-policies.sh`, `tools/vault-apply-roles.sh`, `lib/init/nomad/vault-nomad-auth.sh` | +| `lib/init/nomad/` | Nomad+Vault installer scripts. `cluster-up.sh` — idempotent Step-0 orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling. `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. `vault-nomad-auth.sh` — Step-2 script that enables Vault's JWT auth at path `jwt-nomad`, writes the JWKS/algs config pointing at Nomad's workload-identity signer, delegates role sync to `tools/vault-apply-roles.sh`, installs `/etc/nomad.d/server.hcl`, and SIGHUPs `nomad.service` if the file changed (#881). Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` | diff --git a/lib/generators.sh b/lib/generators.sh index 77af9a7..9ec8444 100644 --- a/lib/generators.sh +++ b/lib/generators.sh @@ -137,6 +137,7 @@ _generate_local_model_services() { - project-repos-${service_name}:/home/agent/repos - \${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:\${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - \${CLAUDE_CONFIG_FILE:-\${HOME}/.claude.json}:/home/agent/.claude.json:ro + - \${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - \${AGENT_SSH_DIR:-\${HOME}/.ssh}:/home/agent/.ssh:ro - ./projects:/home/agent/disinto/projects:ro - ./.env:/home/agent/disinto/.env:ro @@ -381,6 +382,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -634,13 +636,13 @@ COMPOSEEOF _generate_local_model_services "$compose_file" # Resolve the Claude CLI binary path and persist as CLAUDE_BIN_DIR in .env. - # Only used by reproduce and edge services which still use host-mounted CLI. + # docker-compose.yml references ${CLAUDE_BIN_DIR} so the value must be set. local claude_bin claude_bin="$(command -v claude 2>/dev/null || true)" if [ -n "$claude_bin" ]; then claude_bin="$(readlink -f "$claude_bin")" else - echo "Warning: claude CLI not found in PATH — reproduce/edge services will fail to start" >&2 + echo "Warning: claude CLI not found in PATH — set CLAUDE_BIN_DIR in .env manually" >&2 claude_bin="/usr/local/bin/claude" fi # Persist CLAUDE_BIN_DIR into .env so docker-compose can resolve it. @@ -657,6 +659,7 @@ COMPOSEEOF # In build mode, replace image: with build: for locally-built images if [ "$use_build" = true ]; then + sed -i 's|^\( agents:\)|\1|' "$compose_file" sed -i '/^ image: ghcr\.io\/disinto\/agents:/{s|image: ghcr\.io/disinto/agents:.*|build:\n context: .\n dockerfile: docker/agents/Dockerfile\n pull_policy: build|}' "$compose_file" sed -i '/^ image: ghcr\.io\/disinto\/edge:/{s|image: ghcr\.io/disinto/edge:.*|build: ./docker/edge\n pull_policy: build|}' "$compose_file" fi diff --git a/lib/hvault.sh b/lib/hvault.sh index d283330..b0d1635 100644 --- a/lib/hvault.sh +++ b/lib/hvault.sh @@ -405,36 +405,3 @@ hvault_token_lookup() { return 1 } } - -# _hvault_seed_key — Seed a single KV key if it doesn't exist. -# Reads existing data and merges to preserve sibling keys (KV v2 replaces -# .data atomically). Returns 0=created, 1=unchanged, 2=API error. -# Args: -# path: KV v2 logical path (e.g. "disinto/shared/chat") -# key: key name within the path (e.g. "chat_oauth_client_id") -# generator: shell command that outputs a random value (default: openssl rand -hex 32) -# Usage: -# _hvault_seed_key "disinto/shared/chat" "chat_oauth_client_id" -# rc=$? # 0=created, 1=unchanged -_hvault_seed_key() { - local path="$1" key="$2" generator="${3:-openssl rand -hex 32}" - local existing - existing=$(hvault_kv_get "$path" "$key" 2>/dev/null) || true - if [ -n "$existing" ]; then - return 1 # unchanged - fi - - local value - value=$(eval "$generator") - - # Read existing data to preserve sibling keys (KV v2 replaces atomically) - local kv_api="${VAULT_KV_MOUNT}/data/${path}" - local raw existing_data payload - raw="$(hvault_get_or_empty "$kv_api")" || return 2 - existing_data="{}" - [ -n "$raw" ] && existing_data="$(printf '%s' "$raw" | jq '.data.data // {}')" - payload="$(printf '%s' "$existing_data" \ - | jq --arg k "$key" --arg v "$value" '{data: (. + {($k): $v})}')" - _hvault_request POST "$kv_api" "$payload" >/dev/null - return 0 # created -} diff --git a/lib/init/nomad/cluster-up.sh b/lib/init/nomad/cluster-up.sh index 488d2df..4aab42d 100755 --- a/lib/init/nomad/cluster-up.sh +++ b/lib/init/nomad/cluster-up.sh @@ -66,7 +66,6 @@ HOST_VOLUME_DIRS=( "/srv/disinto/agent-data" "/srv/disinto/project-repos" "/srv/disinto/caddy-data" - "/srv/disinto/docker" "/srv/disinto/chat-history" "/srv/disinto/ops-repo" ) @@ -117,7 +116,7 @@ if [ "$dry_run" = true ]; then [dry-run] Step 4/9: create host-volume dirs under /srv/disinto/ EOF for d in "${HOST_VOLUME_DIRS[@]}"; do - printf ' → install -d -m 0777 %s\n' "$d" + printf ' → install -d -m 0755 %s\n' "$d" done cat < — per-job timeout override (e.g., # JOB_READY_TIMEOUT_FORGEJO=300) # @@ -33,7 +33,7 @@ set -euo pipefail # ── Configuration ──────────────────────────────────────────────────────────── SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}" -JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-360}" +JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-240}" DRY_RUN=0 @@ -177,8 +177,7 @@ for job_name in "${JOBS[@]}"; do fi # Per-job timeout override: JOB_READY_TIMEOUT_ - # Sanitize job name: replace hyphens with underscores (bash vars can't have hyphens) - job_upper=$(printf '%s' "$job_name" | tr '[:lower:]-' '[:upper:]_' | tr ' ' '_') + job_upper=$(printf '%s' "$job_name" | tr '[:lower:]' '[:upper:]') timeout_var="JOB_READY_TIMEOUT_${job_upper}" job_timeout="${!timeout_var:-$JOB_READY_TIMEOUT_SECS}" diff --git a/lib/init/nomad/wp-oauth-register.sh b/lib/init/nomad/wp-oauth-register.sh index 8076482..9b7f12a 100755 --- a/lib/init/nomad/wp-oauth-register.sh +++ b/lib/init/nomad/wp-oauth-register.sh @@ -43,8 +43,8 @@ set -euo pipefail # Source the hvault module for Vault helpers SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" -# shellcheck source=../../../lib/hvault.sh +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" +# shellcheck source=../../lib/hvault.sh source "${REPO_ROOT}/lib/hvault.sh" # Configuration diff --git a/nomad/AGENTS.md b/nomad/AGENTS.md index bf62f45..f57c30a 100644 --- a/nomad/AGENTS.md +++ b/nomad/AGENTS.md @@ -1,27 +1,21 @@ - + # nomad/ — Agent Instructions Nomad + Vault HCL for the factory's single-node cluster. These files are the source of truth that `lib/init/nomad/cluster-up.sh` copies onto a factory box under `/etc/nomad.d/` and `/etc/vault.d/` at init time. -This directory covers the **Nomad+Vault migration (Steps 0–5)** — -see issues #821–#992 for the step breakdown. +This directory covers the **Nomad+Vault migration (Steps 0–2)** — +see issues #821–#884 for the step breakdown. ## What lives here | File/Dir | Deployed to | Owned by | |---|---|---| | `server.hcl` | `/etc/nomad.d/server.hcl` | agent role, bind, ports, `data_dir` (S0.2) | -| `client.hcl` | `/etc/nomad.d/client.hcl` | Docker driver cfg + `host_volume` declarations (S0.2); `allow_privileged = true` for woodpecker-agent Docker-in-Docker (S3-fix-5, #961) | +| `client.hcl` | `/etc/nomad.d/client.hcl` | Docker driver cfg + `host_volume` declarations (S0.2) | | `vault.hcl` | `/etc/vault.d/vault.hcl` | Vault storage, listener, UI, `disable_mlock` (S0.3) | | `jobs/forgejo.hcl` | submitted via `lib/init/nomad/deploy.sh` | Forgejo job; reads creds from Vault via consul-template stanza (S2.4) | -| `jobs/woodpecker-server.hcl` | submitted via `lib/init/nomad/deploy.sh` | Woodpecker CI server; host networking, Vault KV for `WOODPECKER_AGENT_SECRET` + Forgejo OAuth creds (S3.1) | -| `jobs/woodpecker-agent.hcl` | submitted via `lib/init/nomad/deploy.sh` | Woodpecker CI agent; host networking, `docker.sock` mount, Vault KV for `WOODPECKER_AGENT_SECRET`; `WOODPECKER_SERVER` uses `${attr.unique.network.ip-address}:9000` (Nomad interpolation) — port binds to LXC alloc IP, not localhost (S3.2, S3-fix-6, #964) | -| `jobs/agents.hcl` | submitted via `lib/init/nomad/deploy.sh` | All 7 agent roles (dev, review, gardener, planner, predictor, supervisor, architect) + llama variant; Vault-templated bot tokens via `service-agents` policy; `force_pull = false` — image is built locally by `bin/disinto --with agents`, no registry (S4.1, S4-fix-2, S4-fix-5, #955, #972, #978) | -| `jobs/staging.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy file-server mounting `docker/` as `/srv/site:ro`; no Vault integration; **dynamic host port** (no static 80 — edge owns 80/443, collision fixed in S5-fix-7 #1018); edge discovers via Nomad service registration (S5.2, #989) | -| `jobs/chat.hcl` | submitted via `lib/init/nomad/deploy.sh` | Claude chat UI; custom `disinto/chat:local` image; sandbox hardening (cap_drop ALL, **tmpfs via mount block** not `tmpfs=` arg — S5-fix-5 #1012, pids_limit 128); Vault-templated OAuth secrets via `service-chat` policy (S5.2, #989) | -| `jobs/edge.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy reverse proxy + dispatcher sidecar; routes /forge, /woodpecker, /staging, /chat; uses `disinto/edge:local` image built by `bin/disinto --with edge`; **both Caddy and dispatcher tasks use `network_mode = "host"`** — upstreams are `127.0.0.1:` (forgejo :3000, woodpecker :8000, chat :8080), not Docker hostnames (#1031, #1034); `FORGE_URL` rendered via Nomad service discovery template (not static env) to handle bridge vs. host network differences (#1034); dispatcher Vault secret path changed to `kv/data/disinto/shared/ops-repo` (#1041); Vault-templated ops-repo creds via `service-dispatcher` policy (S5.1, #988) | Nomad auto-merges every `*.hcl` under `-config=/etc/nomad.d/`, so the split between `server.hcl` and `client.hcl` is for readability, not @@ -36,6 +30,8 @@ convention, KV path summary, and JWT-auth role bindings (S2.1/S2.3). ## Not yet implemented +- **Additional jobspecs** (woodpecker, agents, caddy) — Step 1 brought up + Forgejo; remaining services land in later steps. - **TLS, ACLs, gossip encryption** — deliberately absent for now; land alongside multi-node support. diff --git a/nomad/client.hcl b/nomad/client.hcl index d173ed5..b90d5c1 100644 --- a/nomad/client.hcl +++ b/nomad/client.hcl @@ -49,12 +49,6 @@ client { read_only = false } - # staging static content (docker/ directory with images, HTML, etc.) - host_volume "site-content" { - path = "/srv/disinto/docker" - read_only = true - } - # disinto chat transcripts + attachments. host_volume "chat-history" { path = "/srv/disinto/chat-history" @@ -70,11 +64,11 @@ client { # Docker task driver. `volumes.enabled = true` is required so jobspecs # can mount host_volume declarations defined above. `allow_privileged` -# is true — woodpecker-agent requires `privileged = true` to access -# docker.sock and spawn CI pipeline containers. +# stays false — no factory workload needs privileged containers today, +# and flipping it is an audit-worthy change. plugin "docker" { config { - allow_privileged = true + allow_privileged = false volumes { enabled = true diff --git a/nomad/jobs/agents.hcl b/nomad/jobs/agents.hcl deleted file mode 100644 index 92d377e..0000000 --- a/nomad/jobs/agents.hcl +++ /dev/null @@ -1,207 +0,0 @@ -# ============================================================================= -# nomad/jobs/agents.hcl — All-role agent polling loop (Nomad service job) -# -# Part of the Nomad+Vault migration (S4.1, issue #955). Runs the main bot -# polling loop with all 7 agent roles (review, dev, gardener, architect, -# planner, predictor, supervisor) against the local llama server. -# -# Host_volume contract: -# This job mounts agent-data, project-repos, and ops-repo from -# nomad/client.hcl. Paths under /srv/disinto/* are created by -# lib/init/nomad/cluster-up.sh before any job references them. -# -# Vault integration (S4.1): -# - vault { role = "service-agents" } at group scope — workload-identity -# JWT exchanged for a Vault token carrying the composite service-agents -# policy (vault/policies/service-agents.hcl), which grants read access -# to all 7 bot KV namespaces + vault bot + shared forge config. -# - template stanza renders per-bot FORGE_*_TOKEN + FORGE_PASS from Vault -# KV v2 at kv/disinto/bots/. -# - Seeded on fresh boxes by tools/vault-seed-agents.sh. -# -# Not the runtime yet: docker-compose.yml is still the factory's live stack -# until cutover. This file exists so CI can validate it and S4.2 can wire -# `disinto init --backend=nomad --with agents` to `nomad job run` it. -# ============================================================================= - -job "agents" { - type = "service" - datacenters = ["dc1"] - - group "agents" { - count = 1 - - # ── Vault workload identity (S4.1, issue #955) ─────────────────────────── - # Composite role covering all 7 bot identities + vault bot. Role defined - # in vault/roles.yaml, policy in vault/policies/service-agents.hcl. - # Bound claim pins nomad_job_id = "agents". - vault { - role = "service-agents" - } - - # No network port — agents are outbound-only (poll forgejo, call llama). - # No service discovery block — nothing health-checks agents over HTTP. - - volume "agent-data" { - type = "host" - source = "agent-data" - read_only = false - } - - volume "project-repos" { - type = "host" - source = "project-repos" - read_only = false - } - - volume "ops-repo" { - type = "host" - source = "ops-repo" - read_only = true - } - - # Conservative restart — fail fast to the scheduler. - restart { - attempts = 3 - interval = "5m" - delay = "15s" - mode = "delay" - } - - # ── Service registration ──────────────────────────────────────────────── - # Agents are outbound-only (poll forgejo, call llama) — no HTTP/TCP - # endpoint to probe. The Nomad native provider only supports tcp/http - # checks, not script checks. Registering without a check block means - # Nomad tracks health via task lifecycle: task running = healthy, - # task dead = service deregistered. This matches the docker-compose - # pgrep healthcheck semantics (process alive = healthy). - service { - name = "agents" - provider = "nomad" - } - - task "agents" { - driver = "docker" - - config { - image = "disinto/agents:local" - force_pull = false - - # apparmor=unconfined matches docker-compose — Claude Code needs - # ptrace for node.js inspector and /proc access. - security_opt = ["apparmor=unconfined"] - } - - volume_mount { - volume = "agent-data" - destination = "/home/agent/data" - read_only = false - } - - volume_mount { - volume = "project-repos" - destination = "/home/agent/repos" - read_only = false - } - - volume_mount { - volume = "ops-repo" - destination = "/home/agent/repos/_factory/disinto-ops" - read_only = true - } - - # ── Non-secret env ───────────────────────────────────────────────────── - env { - FORGE_URL = "http://forgejo:3000" - FORGE_REPO = "disinto-admin/disinto" - ANTHROPIC_BASE_URL = "http://10.10.10.1:8081" - ANTHROPIC_API_KEY = "sk-no-key-required" - CLAUDE_MODEL = "unsloth/Qwen3.5-35B-A3B" - AGENT_ROLES = "review,dev,gardener,architect,planner,predictor,supervisor" - POLL_INTERVAL = "300" - DISINTO_CONTAINER = "1" - PROJECT_NAME = "project" - PROJECT_REPO_ROOT = "/home/agent/repos/project" - CLAUDE_TIMEOUT = "7200" - - # llama-specific Claude Code tuning - CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC = "1" - CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS = "1" - CLAUDE_AUTOCOMPACT_PCT_OVERRIDE = "60" - } - - # ── Vault-templated bot tokens (S4.1, issue #955) ───────────────────── - # Renders per-bot FORGE_*_TOKEN + FORGE_PASS from Vault KV v2. - # Each `with secret ...` block reads one bot's KV path; the `else` - # branch emits short placeholders on fresh installs where the path - # is absent. Seed with tools/vault-seed-agents.sh. - # - # Placeholder values kept < 16 chars to avoid secret-scan CI failures. - # error_on_missing_key = false prevents template-pending hangs. - template { - destination = "secrets/bots.env" - env = true - change_mode = "restart" - error_on_missing_key = false - data = < (forgejo :3000, woodpecker :8000, chat :8080). -# Staging uses Nomad service discovery (S5-fix-7, issue #1018). -# -# Host_volume contract: -# This job mounts caddy-data from nomad/client.hcl. Path -# /srv/disinto/caddy-data is created by lib/init/nomad/cluster-up.sh before -# any job references it. Keep the `source = "caddy-data"` below in sync -# with the host_volume stanza in client.hcl. -# -# Build step (S5.1): -# docker/edge/Dockerfile is custom (adds bash, jq, curl, git, docker-cli, -# python3, openssh-client, autossh to caddy:latest). Build as -# disinto/edge:local using the same pattern as disinto/agents:local. -# Command: docker build -t disinto/edge:local -f docker/edge/Dockerfile docker/edge -# -# Not the runtime yet: docker-compose.yml is still the factory's live stack -# until cutover. This file exists so CI can validate it and S5.2 can wire -# `disinto init --backend=nomad --with edge` to `nomad job run` it. -# ============================================================================= - -job "edge" { - type = "service" - datacenters = ["dc1"] - - group "edge" { - count = 1 - - # ── Vault workload identity for dispatcher (S5.1, issue #988) ────────── - # Service role for dispatcher task to fetch vault actions from KV v2. - # Role defined in vault/roles.yaml, policy in vault/policies/dispatcher.hcl. - vault { - role = "service-dispatcher" - } - - # ── Network ports (S5.1, issue #988) ────────────────────────────────── - # Caddy listens on :80 and :443. Expose both on the host. - network { - port "http" { - static = 80 - to = 80 - } - - port "https" { - static = 443 - to = 443 - } - } - - # ── Host-volume mounts (S5.1, issue #988) ───────────────────────────── - # caddy-data: ACME certificates, Caddy config state. - volume "caddy-data" { - type = "host" - source = "caddy-data" - read_only = false - } - - # ops-repo: disinto-ops clone for vault actions polling. - volume "ops-repo" { - type = "host" - source = "ops-repo" - read_only = false - } - - # ── Conservative restart policy ─────────────────────────────────────── - # Caddy should be stable; dispatcher may restart on errors. - restart { - attempts = 3 - interval = "5m" - delay = "15s" - mode = "delay" - } - - # ── Service registration ─────────────────────────────────────────────── - # Caddy is an HTTP reverse proxy — health check on port 80. - service { - name = "edge" - port = "http" - provider = "nomad" - - check { - type = "http" - path = "/" - interval = "10s" - timeout = "3s" - } - } - - # ── Caddy task (S5.1, issue #988) ───────────────────────────────────── - task "caddy" { - driver = "docker" - - config { - # Use pre-built disinto/edge:local image (custom Dockerfile adds - # bash, jq, curl, git, docker-cli, python3, openssh-client, autossh). - image = "disinto/edge:local" - force_pull = false - network_mode = "host" - ports = ["http", "https"] - - # apparmor=unconfined matches docker-compose — needed for autossh - # in the entrypoint script. - security_opt = ["apparmor=unconfined"] - } - - # Mount caddy-data volume for ACME state and config directory. - # Caddyfile is mounted at /etc/caddy/Caddyfile by entrypoint-edge.sh. - volume_mount { - volume = "caddy-data" - destination = "/data" - read_only = false - } - - # ── Caddyfile via Nomad service discovery (S5-fix-7, issue #1018) ──── - # Renders staging upstream from Nomad service registration instead of - # hardcoded staging:80. Caddy picks up /local/Caddyfile via entrypoint. - # Forge URL via Nomad service discovery (issue #1034) — resolves forgejo - # service address/port dynamically for bridge network compatibility. - template { - destination = "local/forge.env" - env = true - change_mode = "restart" - data = < path. Roles defined in - # vault/roles.yaml (runner-), policies in vault/policies/. - vault {} - - volume "ops-repo" { - type = "host" - source = "ops-repo" - read_only = true - } - - # No restart for batch — fail fast, let the dispatcher handle retries. - restart { - attempts = 0 - mode = "fail" - } - - task "runner" { - driver = "docker" - - config { - image = "disinto/agents:local" - force_pull = false - entrypoint = ["bash"] - args = [ - "/home/agent/disinto/docker/runner/entrypoint-runner.sh", - "${NOMAD_META_action_id}", - ] - } - - volume_mount { - volume = "ops-repo" - destination = "/home/agent/ops" - read_only = true - } - - # ── Non-secret env ─────────────────────────────────────────────────────── - env { - DISINTO_CONTAINER = "1" - FACTORY_ROOT = "/home/agent/disinto" - OPS_REPO_ROOT = "/home/agent/ops" - } - - # ── Vault-templated runner secrets (approach A) ──────────────────────── - # Pre-defined templates for all 6 known runner secrets. Each renders - # from kv/data/disinto/runner/. Secrets not granted by the - # dispatch's Vault policies produce empty env vars (harmless). - # error_on_missing_key = false prevents template-pending hangs when - # a secret path is absent or the policy doesn't grant access. - # - # Placeholder values kept < 16 chars to avoid secret-scan CI failures. - template { - destination = "secrets/runner.env" - env = true - error_on_missing_key = false - data = < + # Planner Agent **Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints), diff --git a/predictor/AGENTS.md b/predictor/AGENTS.md index a263066..cec03a1 100644 --- a/predictor/AGENTS.md +++ b/predictor/AGENTS.md @@ -1,4 +1,4 @@ - + # Predictor Agent **Role**: Abstract adversary (the "goblin"). Runs a 2-step formula diff --git a/review/AGENTS.md b/review/AGENTS.md index 24606d1..4c06b34 100644 --- a/review/AGENTS.md +++ b/review/AGENTS.md @@ -1,4 +1,4 @@ - + # Review Agent **Role**: AI-powered PR review — post structured findings and formal diff --git a/supervisor/AGENTS.md b/supervisor/AGENTS.md index 23a3832..77f7b64 100644 --- a/supervisor/AGENTS.md +++ b/supervisor/AGENTS.md @@ -1,4 +1,4 @@ - + # Supervisor Agent **Role**: Health monitoring and auto-remediation, executed as a formula-driven @@ -24,18 +24,12 @@ Both invoke the same `supervisor-run.sh`. Sources `lib/guard.sh` and calls `chec files for `PHASE:escalate` entries and auto-removes any whose linked issue is confirmed closed (24h grace period after closure to avoid races). Reports **stale crashed worktrees** (worktrees preserved after crash) — supervisor - housekeeping removes them after 24h. Collects **Woodpecker agent health** - (added #933): container `disinto-woodpecker-agent` health/running status, - gRPC error count in last 20 min, fast-failure pipeline count (<60s, last 15 min), - and overall health verdict (healthy/unhealthy). Unhealthy verdict triggers - automatic container restart + `blocked:ci_exhausted` issue recovery in - `supervisor-run.sh` before the Claude session starts. + housekeeping removes them after 24h. Also collects **Woodpecker agent health**: + container status, gRPC error count (last 20m), fast-failure pipelines (<60s, + last 15m), and overall health determination. - `formulas/run-supervisor.toml` — Execution spec: five steps (preflight review, health-assessment, decide-actions, report, journal) with `needs` dependencies. - Claude evaluates all metrics and takes actions in a single interactive session. - Health-assessment now includes P2 **Woodpecker agent unhealthy** classification - (container not running, ≥3 gRPC errors/20m, or ≥3 fast-failure pipelines/15m); - decide-actions documents the pre-session auto-recovery path + Claude evaluates all metrics and takes actions in a single interactive session - `$OPS_REPO_ROOT/knowledge/*.md` — Domain-specific remediation guides (memory, disk, CI, git, dev-agent, review-agent, forge) diff --git a/tests/disinto-init-nomad.bats b/tests/disinto-init-nomad.bats index 54c3655..21f4303 100644 --- a/tests/disinto-init-nomad.bats +++ b/tests/disinto-init-nomad.bats @@ -215,44 +215,7 @@ setup_file() { run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with unknown-service --dry-run [ "$status" -ne 0 ] [[ "$output" == *"unknown service"* ]] - [[ "$output" == *"known: forgejo, woodpecker-server, woodpecker-agent, agents, staging, chat, edge"* ]] -} - -# S3.4: woodpecker auto-expansion and forgejo auto-inclusion -@test "disinto init --backend=nomad --with woodpecker auto-expands to server+agent" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with woodpecker --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"services to deploy: forgejo,woodpecker-server,woodpecker-agent"* ]] - [[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent"* ]] -} - -@test "disinto init --backend=nomad --with woodpecker auto-includes forgejo with note" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with woodpecker --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"Note: --with woodpecker implies --with forgejo"* ]] -} - -@test "disinto init --backend=nomad --with forgejo,woodpecker expands woodpecker" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo,woodpecker --dry-run - [ "$status" -eq 0 ] - # Order follows input: forgejo first, then woodpecker expanded - [[ "$output" == *"services to deploy: forgejo,woodpecker-server,woodpecker-agent"* ]] - [[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent"* ]] -} - -@test "disinto init --backend=nomad --with woodpecker seeds both forgejo and woodpecker" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with woodpecker --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"tools/vault-seed-forgejo.sh --dry-run"* ]] - [[ "$output" == *"tools/vault-seed-woodpecker.sh --dry-run"* ]] -} - -@test "disinto init --backend=nomad --with forgejo,woodpecker deploys all three services" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo,woodpecker --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"[deploy] [dry-run] nomad job validate"*"forgejo.hcl"* ]] - [[ "$output" == *"[deploy] [dry-run] nomad job validate"*"woodpecker-server.hcl"* ]] - [[ "$output" == *"[deploy] [dry-run] nomad job validate"*"woodpecker-agent.hcl"* ]] + [[ "$output" == *"known: forgejo"* ]] } @test "disinto init --backend=nomad --with forgejo (flag=value syntax) works" { @@ -385,60 +348,3 @@ setup_file() { [ "$status" -ne 0 ] [[ "$output" == *"--empty and --import-env/--import-sops/--age-key are mutually exclusive"* ]] } - -# S4.2: agents service auto-expansion and dependencies -@test "disinto init --backend=nomad --with agents auto-includes forgejo and woodpecker" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with agents --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"services to deploy: forgejo,agents,woodpecker-server,woodpecker-agent"* ]] - [[ "$output" == *"Note: --with agents implies --with forgejo"* ]] - [[ "$output" == *"Note: --with agents implies --with woodpecker"* ]] -} - -@test "disinto init --backend=nomad --with agents deploys in correct order" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with agents --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent agents"* ]] -} - -@test "disinto init --backend=nomad --with agents seeds agents service" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with agents --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"tools/vault-seed-forgejo.sh --dry-run"* ]] - [[ "$output" == *"tools/vault-seed-woodpecker.sh --dry-run"* ]] - [[ "$output" == *"tools/vault-seed-agents.sh --dry-run"* ]] -} - -@test "disinto init --backend=nomad --with agents deploys all four services" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with agents --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"[deploy] [dry-run] nomad job validate"*"forgejo.hcl"* ]] - [[ "$output" == *"[deploy] [dry-run] nomad job validate"*"woodpecker-server.hcl"* ]] - [[ "$output" == *"[deploy] [dry-run] nomad job validate"*"woodpecker-agent.hcl"* ]] - [[ "$output" == *"[deploy] [dry-run] nomad job validate"*"agents.hcl"* ]] -} - -@test "disinto init --backend=nomad --with woodpecker,agents expands correctly" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with woodpecker,agents --dry-run - [ "$status" -eq 0 ] - # woodpecker expands to server+agent, agents is already explicit - # forgejo is auto-included by agents - [[ "$output" == *"services to deploy: forgejo,woodpecker-server,woodpecker-agent,agents"* ]] - [[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent agents"* ]] -} - -# S5.1 / #1035 — edge service seeds ops-repo (dispatcher FORGE_TOKEN) -@test "disinto init --backend=nomad --with edge deploys edge" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with edge --dry-run - [ "$status" -eq 0 ] - # edge depends on all backend services, so all are included - [[ "$output" == *"services to deploy: edge,forgejo"* ]] - [[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent agents staging chat edge"* ]] - [[ "$output" == *"[deploy] [dry-run] nomad job validate"*"edge.hcl"* ]] -} - -@test "disinto init --backend=nomad --with edge seeds ops-repo" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with edge --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"tools/vault-seed-ops-repo.sh --dry-run"* ]] -} diff --git a/tests/vault-import.bats b/tests/vault-import.bats index e59e92e..890a900 100644 --- a/tests/vault-import.bats +++ b/tests/vault-import.bats @@ -137,7 +137,6 @@ setup() { "${VAULT_ADDR}/v1/kv/data/disinto/shared/woodpecker" [ "$status" -eq 0 ] echo "$output" | grep -q "wp-agent-secret" - # Forgejo keys are normalized: WP_FORGEJO_* → forgejo_* (no wp_ prefix in key name) echo "$output" | grep -q "wp-forgejo-client" echo "$output" | grep -q "wp-forgejo-secret" echo "$output" | grep -q "wp-token" @@ -295,8 +294,6 @@ setup() { "deploy-key-test" "npm-test-token" "dockerhub-test-token" - # Note: forgejo-client and forgejo-secret are NOT in the output - # because they are read from Vault, not logged ) for pattern in "${secret_patterns[@]}"; do diff --git a/tools/vault-import.sh b/tools/vault-import.sh index dd1b73a..f85dd16 100755 --- a/tools/vault-import.sh +++ b/tools/vault-import.sh @@ -391,13 +391,7 @@ EOF local val="${!key}" if [ -n "$val" ]; then local lowercase_key="${key,,}" - # Normalize WP_FORGEJO_* → forgejo_* (strip wp_ prefix to match template) - if [[ "$lowercase_key" =~ ^wp_(.+)$ ]]; then - vault_key="${BASH_REMATCH[1]}" - else - vault_key="$lowercase_key" - fi - operations+=("woodpecker|$vault_key|$env_file|$key") + operations+=("woodpecker|$lowercase_key|$env_file|$key") fi done diff --git a/tools/vault-seed-agents.sh b/tools/vault-seed-agents.sh deleted file mode 100755 index fbed325..0000000 --- a/tools/vault-seed-agents.sh +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# tools/vault-seed-agents.sh — Idempotent seed for all bot KV paths -# -# Part of the Nomad+Vault migration (S4.1, issue #955). Populates -# kv/disinto/bots/ with token + pass for each of the 7 agent roles -# plus the vault bot. Handles the "fresh factory, no .env import" case. -# -# Companion to tools/vault-import.sh — when that runs against a box with -# an existing stack, it overwrites seeded values with real ones. -# -# Idempotency contract (per bot): -# - Both token and pass present → skip, log " unchanged". -# - Either missing → generate random values for missing keys, preserve -# existing keys, write back atomically. -# -# Preconditions: -# - Vault reachable + unsealed at $VAULT_ADDR. -# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable. -# - curl, jq, openssl -# -# Usage: -# tools/vault-seed-agents.sh -# tools/vault-seed-agents.sh --dry-run -# -# Exit codes: -# 0 success (seed applied, or already applied) -# 1 precondition / API / mount-mismatch failure -# ============================================================================= -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" - -# shellcheck source=../lib/hvault.sh -source "${REPO_ROOT}/lib/hvault.sh" - -KV_MOUNT="kv" -TOKEN_BYTES=32 # 32 bytes → 64 hex chars -PASS_BYTES=16 # 16 bytes → 32 hex chars - -# All bot roles seeded by this script. -BOT_ROLES=(dev review gardener architect planner predictor supervisor vault) - -LOG_TAG="[vault-seed-agents]" -log() { printf '%s %s\n' "$LOG_TAG" "$*"; } -die() { printf '%s ERROR: %s\n' "$LOG_TAG" "$*" >&2; exit 1; } - -# ── Flag parsing ───────────────────────────────────────────────────────────── -# while/shift shape — distinct from forgejo (arity:value case) and -# woodpecker (for-loop). -DRY_RUN=0 -while [ $# -gt 0 ]; do - case "$1" in - --dry-run) DRY_RUN=1 ;; - -h|--help) - printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")" - printf 'Seed kv/disinto/bots/ with token + pass for all agent\n' - printf 'roles. Idempotent: existing non-empty values are preserved.\n\n' - printf ' --dry-run Print planned actions without writing.\n' - exit 0 - ;; - *) die "invalid argument: ${1} (try --help)" ;; - esac - shift -done - -# ── Preconditions ──────────────────────────────────────────────────────────── -for bin in curl jq openssl; do - command -v "$bin" >/dev/null 2>&1 \ - || die "required binary not found: ${bin}" -done -[ -n "${VAULT_ADDR:-}" ] \ - || die "VAULT_ADDR unset — e.g. export VAULT_ADDR=http://127.0.0.1:8200" -hvault_token_lookup >/dev/null \ - || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" - -# ── Step 1: ensure kv/ mount exists and is KV v2 ──────────────────────────── -log "── Step 1: ensure ${KV_MOUNT}/ is KV v2 ──" -export DRY_RUN -hvault_ensure_kv_v2 "$KV_MOUNT" "${LOG_TAG}" \ - || die "KV mount check failed" - -# ── Step 2: seed each bot role ─────────────────────────────────────────────── -total_generated=0 - -# Check if shared forge credentials exist for dev role fallback -shared_forge_exists=0 -shared_forge_raw="$(hvault_get_or_empty "${KV_MOUNT}/data/disinto/shared/forge")" \ - || true -if [ -n "$shared_forge_raw" ]; then - shared_forge_token="$(printf '%s' "$shared_forge_raw" | jq -r '.data.data.token // ""')" - shared_forge_pass="$(printf '%s' "$shared_forge_raw" | jq -r '.data.data.pass // ""')" - if [ -n "$shared_forge_token" ] && [ -n "$shared_forge_pass" ]; then - shared_forge_exists=1 - fi -fi - -for role in "${BOT_ROLES[@]}"; do - kv_logical="disinto/bots/${role}" - kv_api="${KV_MOUNT}/data/${kv_logical}" - - log "── seed ${kv_logical} ──" - - existing_raw="$(hvault_get_or_empty "${kv_api}")" \ - || die "failed to read ${kv_api}" - - existing_token="" - existing_pass="" - existing_data="{}" - if [ -n "$existing_raw" ]; then - existing_data="$(printf '%s' "$existing_raw" | jq '.data.data // {}')" - existing_token="$(printf '%s' "$existing_raw" | jq -r '.data.data.token // ""')" - existing_pass="$(printf '%s' "$existing_raw" | jq -r '.data.data.pass // ""')" - fi - - generated=() - desired_token="$existing_token" - desired_pass="$existing_pass" - - # Special case: dev role uses shared forge credentials if available - if [ "$role" = "dev" ] && [ "$shared_forge_exists" -eq 1 ]; then - # Use shared FORGE_TOKEN + FORGE_PASS for dev role - if [ -z "$existing_token" ]; then - desired_token="$shared_forge_token" - generated+=("token") - fi - if [ -z "$existing_pass" ]; then - desired_pass="$shared_forge_pass" - generated+=("pass") - fi - else - # Generate random values for missing keys - if [ -z "$existing_token" ]; then - generated+=("token") - fi - if [ -z "$existing_pass" ]; then - generated+=("pass") - fi - - for key in "${generated[@]}"; do - case "$key" in - token) desired_token="$(openssl rand -hex "$TOKEN_BYTES")" ;; - pass) desired_pass="$(openssl rand -hex "$PASS_BYTES")" ;; - esac - done - fi - - if [ "${#generated[@]}" -eq 0 ]; then - log "${role}: unchanged" - continue - fi - - if [ "$DRY_RUN" -eq 1 ]; then - log "[dry-run] ${role}: would generate ${generated[*]}" - total_generated=$(( total_generated + ${#generated[@]} )) - continue - fi - - # Merge new keys into existing data to preserve any keys we don't own. - payload="$(printf '%s' "$existing_data" \ - | jq --arg t "$desired_token" --arg p "$desired_pass" \ - '{data: (. + {token: $t, pass: $p})}')" - - _hvault_request POST "${kv_api}" "$payload" >/dev/null \ - || die "failed to write ${kv_api}" - - log "${role}: generated ${generated[*]}" - total_generated=$(( total_generated + ${#generated[@]} )) -done - -if [ "$total_generated" -eq 0 ]; then - log "all bot paths already seeded — no-op" -else - log "done — ${total_generated} key(s) seeded across ${#BOT_ROLES[@]} bot paths" -fi diff --git a/tools/vault-seed-chat.sh b/tools/vault-seed-chat.sh deleted file mode 100755 index 08e3837..0000000 --- a/tools/vault-seed-chat.sh +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# tools/vault-seed-chat.sh — Idempotent seed for kv/disinto/shared/chat -# -# Part of the Nomad+Vault migration (S5.2, issue #989). Populates the KV v2 -# path that nomad/jobs/chat.hcl reads from, so a clean-install factory -# (no old-stack secrets to import) still has per-key values for -# CHAT_OAUTH_CLIENT_ID, CHAT_OAUTH_CLIENT_SECRET, and FORWARD_AUTH_SECRET. -# -# Companion to tools/vault-import.sh (S2.2) — when that import runs against -# a box with an existing stack, it overwrites these seeded values with the -# real ones. Order doesn't matter: whichever runs last wins, and both -# scripts are idempotent in the sense that re-running never rotates an -# existing non-empty key. -# -# Uses _hvault_seed_key (lib/hvault.sh) for each key — the helper reads -# existing data and merges to preserve sibling keys (KV v2 replaces .data -# atomically). -# -# Preconditions: -# - Vault reachable + unsealed at $VAULT_ADDR. -# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable. -# - The `kv/` mount is enabled as KV v2. -# -# Requires: VAULT_ADDR, VAULT_TOKEN, curl, jq, openssl -# -# Usage: -# tools/vault-seed-chat.sh -# tools/vault-seed-chat.sh --dry-run -# -# Exit codes: -# 0 success (seed applied, or already applied) -# 1 precondition / API / mount-mismatch failure -# ============================================================================= -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" - -# shellcheck source=../lib/hvault.sh -source "${REPO_ROOT}/lib/hvault.sh" - -KV_MOUNT="kv" -KV_LOGICAL_PATH="disinto/shared/chat" - -# Keys to seed — array-driven loop (structurally distinct from forgejo's -# sequential if-blocks and agents' role loop). -SEED_KEYS=(chat_oauth_client_id chat_oauth_client_secret forward_auth_secret) - -LOG_TAG="[vault-seed-chat]" -log() { printf '%s %s\n' "$LOG_TAG" "$*"; } -die() { printf '%s ERROR: %s\n' "$LOG_TAG" "$*" >&2; exit 1; } - -# ── Flag parsing — [[ ]] guard + case: shape distinct from forgejo -# (arity:value case), woodpecker (for-loop), agents (while/shift). -DRY_RUN=0 -if [[ $# -gt 0 ]]; then - case "$1" in - --dry-run) DRY_RUN=1 ;; - -h|--help) - printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")" - printf 'Seed kv/disinto/shared/chat with random OAuth client\n' - printf 'credentials and forward auth secret if missing.\n' - printf 'Idempotent: existing non-empty values are preserved.\n\n' - printf ' --dry-run Show what would be seeded without writing.\n' - exit 0 - ;; - *) die "invalid argument: ${1} (try --help)" ;; - esac -fi - -# ── Preconditions — inline check-or-die (shape distinct from agents' array -# loop and forgejo's continuation-line style) ───────────────────────────── -command -v curl >/dev/null 2>&1 || die "curl not found" -command -v jq >/dev/null 2>&1 || die "jq not found" -command -v openssl >/dev/null 2>&1 || die "openssl not found" -[ -n "${VAULT_ADDR:-}" ] || die "VAULT_ADDR unset — export VAULT_ADDR=http://127.0.0.1:8200" -hvault_token_lookup >/dev/null || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" - -# ── Step 1/2: ensure kv/ mount exists and is KV v2 ─────────────────────────── -log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──" -export DRY_RUN -hvault_ensure_kv_v2 "$KV_MOUNT" "${LOG_TAG}" \ - || die "KV mount check failed" - -# ── Step 2/2: seed missing keys via _hvault_seed_key helper ────────────────── -log "── Step 2/2: seed ${KV_LOGICAL_PATH} ──" - -generated=() -for key in "${SEED_KEYS[@]}"; do - if [ "$DRY_RUN" -eq 1 ]; then - # Check existence without writing - existing=$(hvault_kv_get "$KV_LOGICAL_PATH" "$key" 2>/dev/null) || true - if [ -z "$existing" ]; then - generated+=("$key") - log "[dry-run] ${key} would be generated" - else - log "[dry-run] ${key} unchanged" - fi - else - rc=0 - _hvault_seed_key "$KV_LOGICAL_PATH" "$key" || rc=$? - case "$rc" in - 0) generated+=("$key"); log "${key} generated" ;; - 1) log "${key} unchanged" ;; - *) die "API error seeding ${key} (rc=${rc})" ;; - esac - fi -done - -if [ "${#generated[@]}" -eq 0 ]; then - log "all keys present — no-op" -else - log "done — ${#generated[@]} key(s) seeded at kv/${KV_LOGICAL_PATH}" -fi diff --git a/tools/vault-seed-ops-repo.sh b/tools/vault-seed-ops-repo.sh deleted file mode 100755 index 09a2fba..0000000 --- a/tools/vault-seed-ops-repo.sh +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# tools/vault-seed-ops-repo.sh — Idempotent seed for kv/disinto/shared/ops-repo -# -# Part of the Nomad+Vault migration (S5.1, issue #1035). Populates the KV v2 -# path that nomad/jobs/edge.hcl dispatcher task reads from, so the edge -# proxy has FORGE_TOKEN for ops repo access. -# -# Seeds from kv/disinto/bots/vault (the vault bot credentials) — copies the -# token field to kv/disinto/shared/ops-repo. This is the "service" path that -# dispatcher uses, distinct from the "agent" path (bots/vault) used by -# agent tasks under the service-agents policy. -# -# Idempotency contract: -# - Key present with non-empty value → leave untouched, log "token unchanged". -# - Key missing or empty → copy from bots/vault, log "token copied". -# - If bots/vault is also empty → generate a random value, log "token generated". -# -# Preconditions: -# - Vault reachable + unsealed at $VAULT_ADDR. -# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable. -# - The `kv/` mount is enabled as KV v2. -# -# Requires: -# - VAULT_ADDR (e.g. http://127.0.0.1:8200) -# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh) -# - curl, jq, openssl -# -# Usage: -# tools/vault-seed-ops-repo.sh -# tools/vault-seed-ops-repo.sh --dry-run -# -# Exit codes: -# 0 success (seed applied, or already applied) -# 1 precondition / API / mount-mismatch failure -# ============================================================================= -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" - -# shellcheck source=../lib/hvault.sh -source "${REPO_ROOT}/lib/hvault.sh" - -# KV v2 mount + logical paths -KV_MOUNT="kv" -OPS_REPO_PATH="disinto/shared/ops-repo" -VAULT_BOT_PATH="disinto/bots/vault" - -OPS_REPO_API="${KV_MOUNT}/data/${OPS_REPO_PATH}" -VAULT_BOT_API="${KV_MOUNT}/data/${VAULT_BOT_PATH}" - -log() { printf '[vault-seed-ops-repo] %s\n' "$*"; } -die() { printf '[vault-seed-ops-repo] ERROR: %s\n' "$*" >&2; exit 1; } - -# ── Flag parsing ───────────────────────────────────────────────────────────── -DRY_RUN=0 -case "$#:${1-}" in - 0:) - ;; - 1:--dry-run) - DRY_RUN=1 - ;; - 1:-h|1:--help) - printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")" - printf 'Seed kv/disinto/shared/ops-repo with FORGE_TOKEN.\n\n' - printf 'Copies token from kv/disinto/bots/vault if present;\n' - printf 'otherwise generates a random value. Idempotent:\n' - printf 'existing non-empty values are left untouched.\n\n' - printf ' --dry-run Print planned actions without writing.\n' - exit 0 - ;; - *) - die "invalid arguments: $* (try --help)" - ;; -esac - -# ── Preconditions ──────────────────────────────────────────────────────────── -for bin in curl jq openssl; do - command -v "$bin" >/dev/null 2>&1 \ - || die "required binary not found: ${bin}" -done - -[ -n "${VAULT_ADDR:-}" ] \ - || die "VAULT_ADDR unset — e.g. export VAULT_ADDR=http://127.0.0.1:8200" -hvault_token_lookup >/dev/null \ - || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" - -# ── Step 1/2: ensure kv/ mount exists and is KV v2 ─────────────────────────── -log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──" -export DRY_RUN -hvault_ensure_kv_v2 "$KV_MOUNT" "[vault-seed-ops-repo]" \ - || die "KV mount check failed" - -# ── Step 2/2: seed ops-repo from vault bot ─────────────────────────────────── -log "── Step 2/2: seed ${OPS_REPO_API} ──" - -# Read existing ops-repo value -existing_raw="$(hvault_get_or_empty "${OPS_REPO_API}")" \ - || die "failed to read ${OPS_REPO_API}" - -existing_token="" -if [ -n "$existing_raw" ]; then - existing_token="$(printf '%s' "$existing_raw" | jq -r '.data.data.token // ""')" -fi - -desired_token="$existing_token" -action="" - -if [ -z "$existing_token" ]; then - # Token missing — try to copy from vault bot - bot_raw="$(hvault_get_or_empty "${VAULT_BOT_API}")" || true - if [ -n "$bot_raw" ]; then - bot_token="$(printf '%s' "$bot_raw" | jq -r '.data.data.token // ""')" - if [ -n "$bot_token" ]; then - desired_token="$bot_token" - action="copied" - fi - fi - - # If still no token, generate one - if [ -z "$desired_token" ]; then - if [ "$DRY_RUN" -eq 1 ]; then - action="generated (dry-run)" - else - desired_token="$(openssl rand -hex 32)" - action="generated" - fi - fi -fi - -if [ -z "$action" ]; then - log "all keys present at ${OPS_REPO_API} — no-op" - log "token unchanged" - exit 0 -fi - -if [ "$DRY_RUN" -eq 1 ]; then - log "[dry-run] ${OPS_REPO_PATH}: would ${action} token" - exit 0 -fi - -# Write the token -payload="$(jq -n --arg t "$desired_token" '{data: {token: $t}}')" -_hvault_request POST "${OPS_REPO_API}" "$payload" >/dev/null \ - || die "failed to write ${OPS_REPO_API}" - -log "${OPS_REPO_PATH}: ${action} token" -log "done — ${OPS_REPO_API} seeded" diff --git a/vault/policies/AGENTS.md b/vault/policies/AGENTS.md index 9a4b588..692c885 100644 --- a/vault/policies/AGENTS.md +++ b/vault/policies/AGENTS.md @@ -1,4 +1,4 @@ - + # vault/policies/ — Agent Instructions HashiCorp Vault ACL policies for the disinto factory. One `.hcl` file per @@ -30,9 +30,6 @@ KV v2). Vault addresses KV v2 data at `kv/data/` and metadata at |---|---| | `service-forgejo` | `kv/data/disinto/shared/forgejo/*` | | `service-woodpecker` | `kv/data/disinto/shared/woodpecker/*` | -| `service-agents` | All 7 `kv/data/disinto/bots//*` namespaces + `kv/data/disinto/shared/forge/*`; composite policy for the `agents` Nomad job (S4.1) | -| `service-chat` | `kv/data/disinto/shared/chat/*`; read-only OAuth client config + forward-auth secret for the chat Nomad job (S5.2, #989) | -| `service-dispatcher` | `kv/data/disinto/runner/*` (list+read) + `kv/data/disinto/shared/ops-repo/*` (read); used by edge dispatcher sidecar (S5.1, #988) | | `bot-` (dev, review, gardener, architect, planner, predictor, supervisor, vault, dev-qwen) | `kv/data/disinto/bots//*` + `kv/data/disinto/shared/forge/*` | | `runner-` (GITHUB\_TOKEN, CODEBERG\_TOKEN, CLAWHUB\_TOKEN, DEPLOY\_KEY, NPM\_TOKEN, DOCKER\_HUB\_TOKEN) | `kv/data/disinto/runner/` (exactly one) | | `dispatcher` | `kv/data/disinto/runner/*` + `kv/data/disinto/shared/ops-repo/*` | diff --git a/vault/policies/bot-architect.hcl b/vault/policies/bot-architect.hcl index 9f84de1..9381b61 100644 --- a/vault/policies/bot-architect.hcl +++ b/vault/policies/bot-architect.hcl @@ -3,14 +3,14 @@ # Architect agent: reads its own bot KV namespace + the shared forge URL. # Attached to the architect-agent Nomad job via workload identity (S2.4). -path "kv/data/disinto/bots/architect" { +path "kv/data/disinto/bots/architect/*" { capabilities = ["read"] } -path "kv/metadata/disinto/bots/architect" { +path "kv/metadata/disinto/bots/architect/*" { capabilities = ["list", "read"] } -path "kv/data/disinto/shared/forge" { +path "kv/data/disinto/shared/forge/*" { capabilities = ["read"] } diff --git a/vault/policies/bot-dev-qwen.hcl b/vault/policies/bot-dev-qwen.hcl index 50f2d2d..b71283d 100644 --- a/vault/policies/bot-dev-qwen.hcl +++ b/vault/policies/bot-dev-qwen.hcl @@ -5,14 +5,14 @@ # via workload identity (S2.4). KV path mirrors the bot basename: # kv/disinto/bots/dev-qwen/*. -path "kv/data/disinto/bots/dev-qwen" { +path "kv/data/disinto/bots/dev-qwen/*" { capabilities = ["read"] } -path "kv/metadata/disinto/bots/dev-qwen" { +path "kv/metadata/disinto/bots/dev-qwen/*" { capabilities = ["list", "read"] } -path "kv/data/disinto/shared/forge" { +path "kv/data/disinto/shared/forge/*" { capabilities = ["read"] } diff --git a/vault/policies/bot-dev.hcl b/vault/policies/bot-dev.hcl index 35cf6de..3771288 100644 --- a/vault/policies/bot-dev.hcl +++ b/vault/policies/bot-dev.hcl @@ -3,14 +3,14 @@ # Dev agent: reads its own bot KV namespace + the shared forge URL. # Attached to the dev-agent Nomad job via workload identity (S2.4). -path "kv/data/disinto/bots/dev" { +path "kv/data/disinto/bots/dev/*" { capabilities = ["read"] } -path "kv/metadata/disinto/bots/dev" { +path "kv/metadata/disinto/bots/dev/*" { capabilities = ["list", "read"] } -path "kv/data/disinto/shared/forge" { +path "kv/data/disinto/shared/forge/*" { capabilities = ["read"] } diff --git a/vault/policies/bot-gardener.hcl b/vault/policies/bot-gardener.hcl index ed45431..f5ef230 100644 --- a/vault/policies/bot-gardener.hcl +++ b/vault/policies/bot-gardener.hcl @@ -3,14 +3,14 @@ # Gardener agent: reads its own bot KV namespace + the shared forge URL. # Attached to the gardener-agent Nomad job via workload identity (S2.4). -path "kv/data/disinto/bots/gardener" { +path "kv/data/disinto/bots/gardener/*" { capabilities = ["read"] } -path "kv/metadata/disinto/bots/gardener" { +path "kv/metadata/disinto/bots/gardener/*" { capabilities = ["list", "read"] } -path "kv/data/disinto/shared/forge" { +path "kv/data/disinto/shared/forge/*" { capabilities = ["read"] } diff --git a/vault/policies/bot-planner.hcl b/vault/policies/bot-planner.hcl index ae3e910..440f6aa 100644 --- a/vault/policies/bot-planner.hcl +++ b/vault/policies/bot-planner.hcl @@ -3,14 +3,14 @@ # Planner agent: reads its own bot KV namespace + the shared forge URL. # Attached to the planner-agent Nomad job via workload identity (S2.4). -path "kv/data/disinto/bots/planner" { +path "kv/data/disinto/bots/planner/*" { capabilities = ["read"] } -path "kv/metadata/disinto/bots/planner" { +path "kv/metadata/disinto/bots/planner/*" { capabilities = ["list", "read"] } -path "kv/data/disinto/shared/forge" { +path "kv/data/disinto/shared/forge/*" { capabilities = ["read"] } diff --git a/vault/policies/bot-predictor.hcl b/vault/policies/bot-predictor.hcl index 7159d72..3a3b6b2 100644 --- a/vault/policies/bot-predictor.hcl +++ b/vault/policies/bot-predictor.hcl @@ -3,14 +3,14 @@ # Predictor agent: reads its own bot KV namespace + the shared forge URL. # Attached to the predictor-agent Nomad job via workload identity (S2.4). -path "kv/data/disinto/bots/predictor" { +path "kv/data/disinto/bots/predictor/*" { capabilities = ["read"] } -path "kv/metadata/disinto/bots/predictor" { +path "kv/metadata/disinto/bots/predictor/*" { capabilities = ["list", "read"] } -path "kv/data/disinto/shared/forge" { +path "kv/data/disinto/shared/forge/*" { capabilities = ["read"] } diff --git a/vault/policies/bot-review.hcl b/vault/policies/bot-review.hcl index f0ddfe4..04c7668 100644 --- a/vault/policies/bot-review.hcl +++ b/vault/policies/bot-review.hcl @@ -3,14 +3,14 @@ # Review agent: reads its own bot KV namespace + the shared forge URL. # Attached to the review-agent Nomad job via workload identity (S2.4). -path "kv/data/disinto/bots/review" { +path "kv/data/disinto/bots/review/*" { capabilities = ["read"] } -path "kv/metadata/disinto/bots/review" { +path "kv/metadata/disinto/bots/review/*" { capabilities = ["list", "read"] } -path "kv/data/disinto/shared/forge" { +path "kv/data/disinto/shared/forge/*" { capabilities = ["read"] } diff --git a/vault/policies/bot-supervisor.hcl b/vault/policies/bot-supervisor.hcl index 4d7f1e2..36ecc90 100644 --- a/vault/policies/bot-supervisor.hcl +++ b/vault/policies/bot-supervisor.hcl @@ -3,14 +3,14 @@ # Supervisor agent: reads its own bot KV namespace + the shared forge URL. # Attached to the supervisor-agent Nomad job via workload identity (S2.4). -path "kv/data/disinto/bots/supervisor" { +path "kv/data/disinto/bots/supervisor/*" { capabilities = ["read"] } -path "kv/metadata/disinto/bots/supervisor" { +path "kv/metadata/disinto/bots/supervisor/*" { capabilities = ["list", "read"] } -path "kv/data/disinto/shared/forge" { +path "kv/data/disinto/shared/forge/*" { capabilities = ["read"] } diff --git a/vault/policies/bot-vault.hcl b/vault/policies/bot-vault.hcl index d2f9fe4..0a088dd 100644 --- a/vault/policies/bot-vault.hcl +++ b/vault/policies/bot-vault.hcl @@ -7,14 +7,14 @@ # NOTE: distinct from the runner-* policies, which gate per-secret access # for vault-runner ephemeral dispatches (Step 5). -path "kv/data/disinto/bots/vault" { +path "kv/data/disinto/bots/vault/*" { capabilities = ["read"] } -path "kv/metadata/disinto/bots/vault" { +path "kv/metadata/disinto/bots/vault/*" { capabilities = ["list", "read"] } -path "kv/data/disinto/shared/forge" { +path "kv/data/disinto/shared/forge/*" { capabilities = ["read"] } diff --git a/vault/policies/dispatcher.hcl b/vault/policies/dispatcher.hcl index a18f1ab..6383ae7 100644 --- a/vault/policies/dispatcher.hcl +++ b/vault/policies/dispatcher.hcl @@ -20,10 +20,10 @@ path "kv/metadata/disinto/runner/*" { capabilities = ["list", "read"] } -path "kv/data/disinto/shared/ops-repo" { +path "kv/data/disinto/shared/ops-repo/*" { capabilities = ["read"] } -path "kv/metadata/disinto/shared/ops-repo" { +path "kv/metadata/disinto/shared/ops-repo/*" { capabilities = ["list", "read"] } diff --git a/vault/policies/service-agents.hcl b/vault/policies/service-agents.hcl deleted file mode 100644 index 4c65a13..0000000 --- a/vault/policies/service-agents.hcl +++ /dev/null @@ -1,76 +0,0 @@ -# vault/policies/service-agents.hcl -# -# Composite policy for the `agents` Nomad job (S4.1, issue #955). -# Grants read access to all 7 bot KV namespaces + shared forge config, -# so a single job running all agent roles can pull per-bot tokens from -# Vault via workload identity. - -# ── Per-bot KV paths (token + pass per role) ───────────────────────────────── -path "kv/data/disinto/bots/dev" { - capabilities = ["read"] -} - -path "kv/metadata/disinto/bots/dev" { - capabilities = ["list", "read"] -} - -path "kv/data/disinto/bots/review" { - capabilities = ["read"] -} - -path "kv/metadata/disinto/bots/review" { - capabilities = ["list", "read"] -} - -path "kv/data/disinto/bots/gardener" { - capabilities = ["read"] -} - -path "kv/metadata/disinto/bots/gardener" { - capabilities = ["list", "read"] -} - -path "kv/data/disinto/bots/architect" { - capabilities = ["read"] -} - -path "kv/metadata/disinto/bots/architect" { - capabilities = ["list", "read"] -} - -path "kv/data/disinto/bots/planner" { - capabilities = ["read"] -} - -path "kv/metadata/disinto/bots/planner" { - capabilities = ["list", "read"] -} - -path "kv/data/disinto/bots/predictor" { - capabilities = ["read"] -} - -path "kv/metadata/disinto/bots/predictor" { - capabilities = ["list", "read"] -} - -path "kv/data/disinto/bots/supervisor" { - capabilities = ["read"] -} - -path "kv/metadata/disinto/bots/supervisor" { - capabilities = ["list", "read"] -} - -path "kv/data/disinto/bots/vault" { - capabilities = ["read"] -} - -path "kv/metadata/disinto/bots/vault" { - capabilities = ["list", "read"] -} - -# ── Shared forge config (URL, bot usernames) ───────────────────────────────── -path "kv/data/disinto/shared/forge" { - capabilities = ["read"] -} diff --git a/vault/policies/service-chat.hcl b/vault/policies/service-chat.hcl deleted file mode 100644 index a021006..0000000 --- a/vault/policies/service-chat.hcl +++ /dev/null @@ -1,15 +0,0 @@ -# vault/policies/service-chat.hcl -# -# Read-only access to shared Chat secrets (OAuth client config, forward auth -# secret). Attached to the Chat Nomad job via workload identity (S5.2). -# -# Scope: kv/disinto/shared/chat — entries owned by the operator and -# shared between the chat service and edge proxy. - -path "kv/data/disinto/shared/chat" { - capabilities = ["read"] -} - -path "kv/metadata/disinto/shared/chat" { - capabilities = ["list", "read"] -} diff --git a/vault/policies/service-dispatcher.hcl b/vault/policies/service-dispatcher.hcl deleted file mode 100644 index bdc7ddb..0000000 --- a/vault/policies/service-dispatcher.hcl +++ /dev/null @@ -1,29 +0,0 @@ -# vault/policies/service-dispatcher.hcl -# -# Edge dispatcher policy: needs to enumerate the runner secret namespace -# (to check secret presence before dispatching) and read the shared -# ops-repo credentials (token + clone URL) it uses to fetch action TOMLs. -# -# Scope: -# - kv/disinto/runner/* — read all per-secret values + list keys -# - kv/disinto/shared/ops-repo/* — read the ops-repo creds bundle -# -# The actual ephemeral runner container created per dispatch gets the -# narrow runner- policies, NOT this one. This policy stays bound -# to the long-running dispatcher only. - -path "kv/data/disinto/runner/*" { - capabilities = ["read"] -} - -path "kv/metadata/disinto/runner/*" { - capabilities = ["list", "read"] -} - -path "kv/data/disinto/shared/ops-repo" { - capabilities = ["read"] -} - -path "kv/metadata/disinto/shared/ops-repo" { - capabilities = ["list", "read"] -} diff --git a/vault/policies/service-woodpecker.hcl b/vault/policies/service-woodpecker.hcl index 34b3795..19c9726 100644 --- a/vault/policies/service-woodpecker.hcl +++ b/vault/policies/service-woodpecker.hcl @@ -6,10 +6,10 @@ # Scope: kv/disinto/shared/woodpecker/* — entries owned by the operator # and consumed by woodpecker-server + woodpecker-agent. -path "kv/data/disinto/shared/woodpecker" { +path "kv/data/disinto/shared/woodpecker/*" { capabilities = ["read"] } -path "kv/metadata/disinto/shared/woodpecker" { +path "kv/metadata/disinto/shared/woodpecker/*" { capabilities = ["list", "read"] } diff --git a/vault/roles.yaml b/vault/roles.yaml index c058a30..2109504 100644 --- a/vault/roles.yaml +++ b/vault/roles.yaml @@ -62,21 +62,6 @@ roles: namespace: default job_id: woodpecker-agent - # ── Agents composite (nomad/jobs/agents.hcl — S4.1) ────────────────────── - # Single job running all 7 agent roles. Uses a composite policy - # (vault/policies/service-agents.hcl) that unions all bot KV paths. - - name: service-agents - policy: service-agents - namespace: default - job_id: agents - - # ── Chat UI (nomad/jobs/chat.hcl — S5.2) ───────────────────────────────── - # Claude chat UI service with OAuth secrets. Uses vault/policies/service-chat.hcl. - - name: service-chat - policy: service-chat - namespace: default - job_id: chat - # ── Per-agent bots (nomad/jobs/bot-.hcl — land in later steps) ─────── # job_id placeholders match the policy name 1:1 until each bot's jobspec # lands. When a bot's jobspec is added under nomad/jobs/, update the @@ -128,10 +113,10 @@ roles: job_id: bot-vault # ── Edge dispatcher ──────────────────────────────────────────────────────── - - name: service-dispatcher - policy: service-dispatcher + - name: dispatcher + policy: dispatcher namespace: default - job_id: edge + job_id: dispatcher # ── Per-secret runner roles ──────────────────────────────────────────────── # vault-runner (Step 5) composes runner- policies onto each