diff --git a/.woodpecker/nomad-validate.yml b/.woodpecker/nomad-validate.yml index 81e45ae..d5828e9 100644 --- a/.woodpecker/nomad-validate.yml +++ b/.woodpecker/nomad-validate.yml @@ -16,7 +16,7 @@ # Steps (all fail-closed — any error blocks merge): # 1. nomad-config-validate — `nomad config validate` on server + client HCL # 2. nomad-job-validate — `nomad job validate` looped over every -# nomad/jobs/*.hcl (new jobspecs get +# nomad/jobs/*.nomad.hcl (new jobspecs get # CI coverage automatically) # 3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl # 4. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto @@ -57,7 +57,6 @@ steps: - name: nomad-config-validate image: hashicorp/nomad:1.9.5 commands: - - nomad version - nomad config validate nomad/server.hcl nomad/client.hcl # ── 2. Nomad jobspec HCL syntax check ──────────────────────────────────── @@ -69,15 +68,15 @@ steps: # # Validation is offline: no running Nomad server is required (exit 0 on # valid HCL, 1 on syntax/semantic error). The CLI takes a single path - # argument so we loop over every `*.hcl` file under nomad/jobs/ — + # argument so we loop over every `*.nomad.hcl` file under nomad/jobs/ — # that way a new jobspec PR gets CI coverage automatically (no separate - # "edit the pipeline" step to forget). The `.hcl` suffix is the naming - # convention: anything else in nomad/jobs/ is deliberately not validated - # by this step. + # "edit the pipeline" step to forget). The `.nomad.hcl` suffix is the + # naming convention documented in nomad/AGENTS.md; anything else in + # nomad/jobs/ is deliberately not validated by this step. # # `[ -f "$f" ]` guards against the no-match case: POSIX sh does not # nullglob, so an empty jobs/ directory would leave the literal glob in - # "$f" and fail. Today forgejo.hcl exists, but the guard keeps the + # "$f" and fail. 
Today forgejo.nomad.hcl exists, but the guard keeps the # step safe during any future transient empty state. # # Scope note: offline validate catches jobspec-level errors (unknown @@ -92,7 +91,7 @@ steps: commands: - | set -e - for f in nomad/jobs/*.hcl; do + for f in nomad/jobs/*.nomad.hcl; do [ -f "$f" ] || continue echo "validating jobspec: $f" nomad job validate "$f" diff --git a/bin/disinto b/bin/disinto index 6128b7c..dc56f39 100755 --- a/bin/disinto +++ b/bin/disinto @@ -82,7 +82,6 @@ Init options: --ci-id Woodpecker CI repo ID (default: 0 = no CI) --forge-url Forge base URL (default: http://localhost:3000) --backend Orchestration backend: docker (default) | nomad - --with (nomad) Deploy services: forgejo[,...] (S1.3) --empty (nomad) Bring up cluster only, no jobs (S0.4) --bare Skip compose generation (bare-metal setup) --build Use local docker build instead of registry images (dev mode) @@ -663,20 +662,14 @@ prompt_admin_password() { # init run); operators running without sudo-NOPASSWD should invoke # `sudo disinto init ...` directly. _disinto_init_nomad() { - local dry_run="${1:-false}" empty="${2:-false}" with_services="${3:-}" + local dry_run="${1:-false}" empty="${2:-false}" local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh" - local deploy_sh="${FACTORY_ROOT}/lib/init/nomad/deploy.sh" if [ ! -x "$cluster_up" ]; then echo "Error: ${cluster_up} not found or not executable" >&2 exit 1 fi - if [ -n "$with_services" ] && [ ! -x "$deploy_sh" ]; then - echo "Error: ${deploy_sh} not found or not executable" >&2 - exit 1 - fi - # --empty and default both invoke cluster-up today. Log the requested # mode so the dispatch is visible in factory bootstrap logs — Step 1 # will branch on $empty to gate the job-deployment path. 
@@ -686,105 +679,31 @@ _disinto_init_nomad() { echo "nomad backend: default (cluster-up; jobs deferred to Step 1)" fi - # Dry-run: print cluster-up plan + deploy.sh plan + # Dry-run forwards straight through; cluster-up.sh prints its own step + # list and exits 0 without touching the box. + local -a cmd=("$cluster_up") if [ "$dry_run" = "true" ]; then - echo "" - echo "── Cluster-up dry-run ─────────────────────────────────" - local -a cmd=("$cluster_up" "--dry-run") - "${cmd[@]}" || true - echo "" - - if [ -n "$with_services" ]; then - echo "── Deploy services dry-run ────────────────────────────" - echo "[deploy] services to deploy: ${with_services}" - local IFS=',' - for svc in $with_services; do - svc=$(echo "$svc" | xargs) # trim whitespace - # Validate known services first - case "$svc" in - forgejo) ;; - *) - echo "Error: unknown service '${svc}' — known: forgejo" >&2 - exit 1 - ;; - esac - local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" - if [ ! -f "$jobspec_path" ]; then - echo "Error: jobspec not found: ${jobspec_path}" >&2 - exit 1 - fi - echo "[deploy] [dry-run] nomad job validate ${jobspec_path}" - echo "[deploy] [dry-run] nomad job run -detach ${jobspec_path}" - done - echo "[deploy] dry-run complete" - fi - exit 0 + cmd+=("--dry-run") + "${cmd[@]}" + exit $? fi - # Real run: cluster-up + deploy services - local -a cluster_cmd=("$cluster_up") + # Real run — needs root. Invoke via sudo if we're not already root so + # the command's exit code propagates directly. We don't distinguish + # "sudo denied" from "cluster-up.sh failed" here; both surface as a + # non-zero exit, and cluster-up.sh's own error messages cover the + # latter case. + local rc=0 if [ "$(id -u)" -eq 0 ]; then - "${cluster_cmd[@]}" || exit $? + "${cmd[@]}" || rc=$? else if ! command -v sudo >/dev/null 2>&1; then echo "Error: cluster-up.sh must run as root and sudo is not installed" >&2 exit 1 fi - sudo -n -- "${cluster_cmd[@]}" || exit $? + sudo -n -- "${cmd[@]}" || rc=$? 
fi - - # Deploy services if requested - if [ -n "$with_services" ]; then - echo "" - echo "── Deploying services ─────────────────────────────────" - local -a deploy_cmd=("$deploy_sh") - # Split comma-separated service list into positional args - local IFS=',' - for svc in $with_services; do - svc=$(echo "$svc" | xargs) # trim whitespace - if ! echo "$svc" | grep -qE '^[a-zA-Z0-9_-]+$'; then - echo "Error: invalid service name '${svc}' — must match ^[a-zA-Z0-9_-]+$" >&2 - exit 1 - fi - # Validate known services FIRST (before jobspec check) - case "$svc" in - forgejo) ;; - *) - echo "Error: unknown service '${svc}' — known: forgejo" >&2 - exit 1 - ;; - esac - # Check jobspec exists - local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" - if [ ! -f "$jobspec_path" ]; then - echo "Error: jobspec not found: ${jobspec_path}" >&2 - exit 1 - fi - deploy_cmd+=("$svc") - done - - if [ "$(id -u)" -eq 0 ]; then - "${deploy_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: deploy.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- "${deploy_cmd[@]}" || exit $? 
- fi - - # Print final summary - echo "" - echo "── Summary ────────────────────────────────────────────" - echo "Cluster: Nomad+Vault cluster is up" - echo "Deployed: ${with_services}" - if echo "$with_services" | grep -q "forgejo"; then - echo "Ports: forgejo: 3000" - fi - echo "────────────────────────────────────────────────────────" - fi - - exit 0 + exit "$rc" } disinto_init() { @@ -802,7 +721,7 @@ disinto_init() { fi # Parse flags - local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false with_services="" + local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false while [ $# -gt 0 ]; do case "$1" in --branch) branch="$2"; shift 2 ;; @@ -811,8 +730,6 @@ disinto_init() { --forge-url) forge_url_flag="$2"; shift 2 ;; --backend) backend="$2"; shift 2 ;; --backend=*) backend="${1#--backend=}"; shift ;; - --with) with_services="$2"; shift 2 ;; - --with=*) with_services="${1#--with=}"; shift ;; --bare) bare=true; shift ;; --build) use_build=true; shift ;; --empty) empty=true; shift ;; @@ -847,23 +764,11 @@ disinto_init() { exit 1 fi - # --with requires --backend=nomad - if [ -n "$with_services" ] && [ "$backend" != "nomad" ]; then - echo "Error: --with requires --backend=nomad" >&2 - exit 1 - fi - - # --empty and --with are mutually exclusive - if [ "$empty" = true ] && [ -n "$with_services" ]; then - echo "Error: --empty and --with are mutually exclusive" >&2 - exit 1 - fi - # Dispatch on backend — the nomad path runs lib/init/nomad/cluster-up.sh # (S0.4). The default and --empty variants are identical today; Step 1 # will branch on $empty to add job deployment to the default path. 
if [ "$backend" = "nomad" ]; then - _disinto_init_nomad "$dry_run" "$empty" "$with_services" + _disinto_init_nomad "$dry_run" "$empty" # shellcheck disable=SC2317 # _disinto_init_nomad always exits today; # `return` is defensive against future refactors. return diff --git a/docs/agents-llama.md b/docs/agents-llama.md index bc973b7..85b1758 100644 --- a/docs/agents-llama.md +++ b/docs/agents-llama.md @@ -61,27 +61,34 @@ This writes `ANTHROPIC_API_KEY` to `.env` instead of `ANTHROPIC_BASE_URL`. ## Activation and running -Once hired, the agent service is added to `docker-compose.yml`. Start the -service with `docker compose up -d`: +### Default activation (single agent) + +Once hired, the agent service is added to `docker-compose.yml` but not started +by default. To start a single agent: ```bash -# Start all agent services -docker compose up -d +# Start just the dev-qwen agent +COMPOSE_PROFILES=agents-dev-qwen docker compose up -d +``` -# Start a single named agent service -docker compose up -d agents-dev-qwen +**Important:** Local-model agent services are profile-gated. Running `docker +compose up -d` without `COMPOSE_PROFILES` will not start them, and `--remove-orphans` +may remove them as unmanaged containers. 
-# Start multiple named agent services -docker compose up -d agents-dev-qwen agents-planner +### Starting multiple agents + +```bash +# Start multiple agents (COMPOSE_PROFILES takes a single comma-separated list; +# repeating the variable would override, leaving only the last profile active) +COMPOSE_PROFILES=agents-dev-qwen,agents-planner docker compose up -d ``` ### Stopping agents ```bash -# Stop a specific agent service -docker compose down agents-dev-qwen +# Stop specific agents +COMPOSE_PROFILES=agents-dev-qwen docker compose down -# Stop all agent services +# Stop everything (set COMPOSE_PROFILES to include profile-gated agents) docker compose down ``` @@ -156,14 +163,15 @@ poll_interval = 60 ### Agent service not starting -Check that the service was created by `disinto hire-an-agent`: +Check that you're using `COMPOSE_PROFILES`: ```bash -docker compose config | grep -A5 "agents-dev-qwen" -``` +# Wrong: this won't start profile-gated agent services +docker compose up -d -If the service is missing, re-run `disinto hire-an-agent dev-qwen dev` to -regenerate `docker-compose.yml`. +# Correct: explicitly specify the profile +COMPOSE_PROFILES=agents-dev-qwen docker compose up -d +``` ### Model endpoint unreachable @@ -180,7 +188,7 @@ If using a custom host IP, update `ANTHROPIC_BASE_URL` in `.env`: sed -i 's|^ANTHROPIC_BASE_URL=.*|ANTHROPIC_BASE_URL=http://192.168.1.100:8081|' .env # Restart the agent -docker compose restart agents-dev-qwen +COMPOSE_PROFILES=agents-dev-qwen docker compose restart agents-dev-qwen ``` ### Invalid agent name diff --git a/nomad/AGENTS.md b/nomad/AGENTS.md index 953a7b2..d80780f 100644 --- a/nomad/AGENTS.md +++ b/nomad/AGENTS.md @@ -24,7 +24,7 @@ it owns. ## What does NOT live here yet - **Jobspecs.** Step 0 brings up an *empty* cluster. Step 1 (and later) - adds `*.hcl` job files for forgejo, woodpecker, agents, caddy, + adds `*.nomad.hcl` job files for forgejo, woodpecker, agents, caddy, etc. When that lands, jobspecs will live in `nomad/jobs/` and each will get its own header comment pointing to the `host_volume` names it consumes (`volume = "forgejo-data"`, etc. 
— declared in @@ -35,11 +35,11 @@ it owns. ## Adding a jobspec (Step 1 and later) -1. Drop a file in `nomad/jobs/.hcl`. The `.hcl` suffix is - load-bearing: `.woodpecker/nomad-validate.yml` globs on exactly that - suffix to auto-pick up new jobspecs (see step 2 in "How CI validates - these files" below). Anything else in `nomad/jobs/` is silently - skipped by CI. +1. Drop a file in `nomad/jobs/.nomad.hcl`. The `.nomad.hcl` + suffix is load-bearing: `.woodpecker/nomad-validate.yml` globs on + exactly that suffix to auto-pick up new jobspecs (see step 2 in + "How CI validates these files" below). Anything else in + `nomad/jobs/` is silently skipped by CI. 2. If it needs persistent state, reference a `host_volume` already declared in `client.hcl` — *don't* add ad-hoc host paths in the jobspec. If a new volume is needed, add it to **both**: @@ -52,9 +52,9 @@ it owns. rejects the mismatch at placement time instead. 3. Pin image tags — `image = "forgejo/forgejo:1.22.5"`, not `:latest`. 4. No pipeline edit required — step 2 of `nomad-validate.yml` globs - over `nomad/jobs/*.hcl` and validates every match. Just make sure - the existing `nomad/**` trigger path still covers your file (it - does for anything under `nomad/jobs/`). + over `nomad/jobs/*.nomad.hcl` and validates every match. Just make + sure the existing `nomad/**` trigger path still covers your file + (it does for anything under `nomad/jobs/`). ## How CI validates these files @@ -67,7 +67,7 @@ fail-closed steps: driver config. Vault HCL is excluded (different tool). Jobspecs are excluded too — agent-config and jobspec are disjoint HCL grammars; running this step on a jobspec rejects it with "unknown block 'job'". -2. **`nomad job validate nomad/jobs/*.hcl`** (loop, one call per file) +2. **`nomad job validate nomad/jobs/*.nomad.hcl`** (loop, one call per file) — parses each jobspec's HCL, fails on unknown stanzas, missing required fields, wrong value types, invalid driver config. 
Runs offline (no Nomad server needed) so CI exit 0 ≠ "this will schedule @@ -79,7 +79,7 @@ fail-closed steps: - image reachability — `image = "codeberg.org/forgejo/forgejo:11.0"` is accepted even if the registry is down or the tag is wrong. New jobspecs are picked up automatically by the glob — no pipeline - edit needed as long as the file is named `.hcl`. + edit needed as long as the file is named `.nomad.hcl`. 3. **`vault operator diagnose -config=nomad/vault.hcl -skip=storage -skip=listener`** — Vault's equivalent syntax + schema check. `-skip=storage/listener` disables the runtime checks (CI containers don't have diff --git a/nomad/jobs/forgejo.hcl b/nomad/jobs/forgejo.nomad.hcl similarity index 98% rename from nomad/jobs/forgejo.hcl rename to nomad/jobs/forgejo.nomad.hcl index b2c057f..c7a0326 100644 --- a/nomad/jobs/forgejo.hcl +++ b/nomad/jobs/forgejo.nomad.hcl @@ -1,5 +1,5 @@ # ============================================================================= -# nomad/jobs/forgejo.hcl — Forgejo git server (Nomad service job) +# nomad/jobs/forgejo.nomad.hcl — Forgejo git server (Nomad service job) # # Part of the Nomad+Vault migration (S1.1, issue #840). 
First jobspec to # land under nomad/jobs/ — proves the docker driver + host_volume plumbing diff --git a/tests/disinto-init-nomad.bats b/tests/disinto-init-nomad.bats index 8616e2d..5b2648b 100644 --- a/tests/disinto-init-nomad.bats +++ b/tests/disinto-init-nomad.bats @@ -143,51 +143,3 @@ setup_file() { [[ "$output" == *"repo URL required"* ]] [[ "$output" != *"Unknown option"* ]] } - -# ── --with flag tests ───────────────────────────────────────────────────────── - -@test "disinto init --backend=nomad --with forgejo --dry-run prints deploy plan" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"services to deploy: forgejo"* ]] - [[ "$output" == *"[deploy] [dry-run] nomad job validate"* ]] - [[ "$output" == *"[deploy] [dry-run] nomad job run -detach"* ]] - [[ "$output" == *"[deploy] dry-run complete"* ]] -} - -@test "disinto init --backend=nomad --with forgejo,forgejo --dry-run handles comma-separated services" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo,forgejo --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"services to deploy: forgejo,forgejo"* ]] -} - -@test "disinto init --backend=docker --with forgejo errors with '--with requires --backend=nomad'" { - run "$DISINTO_BIN" init placeholder/repo --backend=docker --with forgejo - [ "$status" -ne 0 ] - [[ "$output" == *"--with requires --backend=nomad"* ]] -} - -@test "disinto init --backend=nomad --empty --with forgejo errors with mutually exclusive" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --empty --with forgejo - [ "$status" -ne 0 ] - [[ "$output" == *"--empty and --with are mutually exclusive"* ]] -} - -@test "disinto init --backend=nomad --with unknown-service errors with unknown service" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with unknown-service --dry-run - [ "$status" -ne 0 ] - [[ "$output" == *"unknown service"* ]] - [[ "$output" == *"known: forgejo"* 
]] -} - -@test "disinto init --backend=nomad --with forgejo (flag=value syntax) works" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with=forgejo --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"services to deploy: forgejo"* ]] -} - -@test "disinto init --backend=nomad --with forgejo --empty --dry-run rejects in any order" { - run "$DISINTO_BIN" init placeholder/repo --with forgejo --backend=nomad --empty --dry-run - [ "$status" -ne 0 ] - [[ "$output" == *"--empty and --with are mutually exclusive"* ]] -}