diff --git a/.woodpecker/detect-duplicates.py b/.woodpecker/detect-duplicates.py index 9c87b1d..9b108bf 100644 --- a/.woodpecker/detect-duplicates.py +++ b/.woodpecker/detect-duplicates.py @@ -294,10 +294,6 @@ def main() -> int: "9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern", # Standard lib source block shared across formula-driven agent run scripts "330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)", - # Test data for duplicate service detection tests (#850) - # Intentionally duplicated TOML blocks in smoke-init.sh and test-duplicate-service-detection.sh - "334967b8b4f1a8d3b0b9b8e0912f3bfb": "Test TOML: [agents.llama] block header (smoke-init.sh + test-duplicate-service-detection.sh)", - "d82f30077e5bb23b5fc01db003033d5d": "Test TOML: [agents.llama] block body (smoke-init.sh + test-duplicate-service-detection.sh)", # Common vault-seed script patterns: logging helpers + flag parsing # Used in tools/vault-seed-woodpecker.sh + lib/init/nomad/wp-oauth-register.sh "843a1cbf987952697d4e05e96ed2b2d5": "Logging helpers + DRY_RUN init (vault-seed-woodpecker + wp-oauth-register)", @@ -312,21 +308,6 @@ def main() -> int: "63bfa88d71764c95c65a9a248f3e40ab": "Vault-seed preconditions: binary check end + VAULT_ADDR die", "34873ad3570b211ce1d90468ab6ac94c": "Vault-seed preconditions: VAULT_ADDR die + hvault_token_lookup", "71a52270f249e843cda48ad896d9f781": "Vault-seed preconditions: VAULT_ADDR + hvault_token_lookup + die", - # Common vault-seed script flag parsing patterns - # Shared across tools/vault-seed-{forgejo,ops-repo}.sh - "6906b7787796c2ccb8dd622e2ad4e7bf": "vault-seed DRY_RUN init + case pattern (forgejo + ops-repo)", - "a0df5283b616b964f8bc32fd99ec1b5a": "vault-seed case pattern start (forgejo + ops-repo)", - "e15e3272fdd9f0f46ce9e726aea9f853": "vault-seed case pattern dry-run handler (forgejo + ops-repo)", - "c9f22385cc49a3dac1d336bc14c6315b": "vault-seed DRY_RUN assignment (forgejo + ops-repo)", - "106f4071e88f841b3208b01144cd1c39": "vault-seed case pattern dry-run end (forgejo + ops-repo)", - "c15506dcb6bb340b25d1c39d442dd2e6": "vault-seed help text + invalid arg handler (forgejo + ops-repo)", - "1feecd3b3caf00045fae938ddf2811de": "vault-seed invalid arg handler (forgejo + ops-repo)", - "919780d5e7182715344f5aa02b191294": "vault-seed invalid arg + esac pattern (forgejo + ops-repo)", - "8dce1d292bce8e60ef4c0665b62945b0": "vault-seed esac + binary check loop (forgejo + ops-repo)", - "ca043687143a5b47bd54e65a99ce8ee8": "vault-seed binary check loop start (forgejo + ops-repo)", - "aefd9f655411a955395e6e5995ddbe6f": "vault-seed binary check pattern (forgejo + ops-repo)", - "60f0c46deb5491599457efb4048918e5": "vault-seed VAULT_ADDR + hvault_token_lookup check (forgejo + ops-repo)", - "f6838f581ef6b4d82b55268389032769": "vault-seed VAULT_ADDR + hvault_token_lookup die (forgejo + ops-repo)", } if not sh_files: diff --git a/.woodpecker/edge-subpath.yml b/.woodpecker/edge-subpath.yml new file mode 100644 index 0000000..6e0a17e --- /dev/null +++ b/.woodpecker/edge-subpath.yml @@ -0,0 +1,55 @@ +# .woodpecker/edge-subpath.yml — Edge subpath routing smoke test +# +# Runs end-to-end smoke tests for Forgejo, Woodpecker, and chat subpath routing: +# - Forgejo at /forge/ +# - Woodpecker at /ci/ +# - Chat at /chat/ +# - Staging at /staging/ +# +# Tests: +# 1. Root / redirects to /forge/ +# 2. Forgejo login at /forge/ completes without redirect loops +# 3. Forgejo OAuth callback for Woodpecker succeeds under subpath +# 4. Woodpecker dashboard loads all assets at /ci/ (no 404s on JS/CSS) +# 5. Chat OAuth login flow works at /chat/login +# 6. Forward_auth on /chat/* rejects unauthenticated requests with 401 +# 7. Staging content loads at /staging/ +# +# Triggers: +# - Pull requests that modify edge-related files +# - Manual trigger for on-demand testing +# +# Environment variables (set in CI or via pipeline): +# EDGE_BASE_URL — Edge proxy URL (default: http://localhost) +# EDGE_TIMEOUT — Request timeout in seconds (default: 30) +# EDGE_MAX_RETRIES — Max retries per request (default: 3) +# +# When to run: +# - Any change to edge.hcl, docker/edge/, tools/edge-control/ +# - Any change to this pipeline file +# - Manual trigger for testing edge deployments + +when: + event: [pull_request, manual] + path: + - "nomad/jobs/edge.hcl" + - "docker/edge/**" + - "tools/edge-control/**" + - ".woodpecker/edge-subpath.yml" + - "tests/smoke-edge-subpath.sh" + +clone: + git: + image: alpine/git + commands: + - AUTH_URL=$(printf '%s' "$CI_REPO_CLONE_URL" | sed "s|://|://token:$FORGE_TOKEN@|") + - git clone --depth 1 "$AUTH_URL" . + - git fetch --depth 1 origin "$CI_COMMIT_REF" + - git checkout FETCH_HEAD + +steps: + - name: edge-subpath-smoke-test + image: alpine:3.19 + commands: + - apk add --no-cache bash curl jq + - bash tests/smoke-edge-subpath.sh diff --git a/AGENTS.md b/AGENTS.md index 97634a4..9c42667 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,4 +1,4 @@ - + # Disinto — Agent Instructions ## What this repo is diff --git a/architect/AGENTS.md b/architect/AGENTS.md index 61987ae..7286ee3 100644 --- a/architect/AGENTS.md +++ b/architect/AGENTS.md @@ -1,4 +1,4 @@ - + # Architect — Agent Instructions ## What this agent is diff --git a/bin/disinto b/bin/disinto index 3740898..c18ef0c 100755 --- a/bin/disinto +++ b/bin/disinto @@ -12,7 +12,6 @@ # disinto secrets Manage encrypted secrets # disinto run Run action in ephemeral runner container # disinto ci-logs [--step ] Read CI logs from Woodpecker SQLite -# disinto backup create Export factory state for migration # # Usage: # disinto init https://github.com/user/repo @@ -40,7 +39,6 @@ source "${FACTORY_ROOT}/lib/generators.sh" source "${FACTORY_ROOT}/lib/forge-push.sh" source "${FACTORY_ROOT}/lib/ci-setup.sh" source "${FACTORY_ROOT}/lib/release.sh" -source "${FACTORY_ROOT}/lib/backup.sh" source "${FACTORY_ROOT}/lib/claude-config.sh" # ── Helpers ────────────────────────────────────────────────────────────────── @@ -64,7 +62,6 @@ Usage: disinto hire-an-agent [--formula ] [--local-model ] [--model ] Hire a new agent (create user + .profile repo; re-run to rotate credentials) disinto agent Manage agent state (enable/disable) - disinto backup create Export factory state (issues + ops bundle) disinto edge [options] Manage edge tunnel registrations Edge subcommands: @@ -805,7 +802,6 @@ _disinto_init_nomad() { woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; agents) seed_name="agents" ;; chat) seed_name="chat" ;; - edge) seed_name="ops-repo" ;; esac local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" if [ -x "$seed_script" ]; then @@ -987,7 +983,6 @@ _disinto_init_nomad() { woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; agents) seed_name="agents" ;; chat) seed_name="chat" ;; - edge) seed_name="ops-repo" ;; esac local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" if [ -x "$seed_script" ]; then @@ -2896,23 +2891,6 @@ EOF esac } -# ── backup command ──────────────────────────────────────────────────────────── -# Usage: disinto backup create -disinto_backup() { - local subcmd="${1:-}" - shift || true - - case "$subcmd" in - create) - backup_create "$@" - ;; - *) - echo "Usage: disinto backup create " >&2 - exit 1 - ;; - esac -} - # ── Main dispatch ──────────────────────────────────────────────────────────── case "${1:-}" in @@ -2929,7 +2907,6 @@ case "${1:-}" in hire-an-agent) shift; disinto_hire_an_agent "$@" ;; agent) shift; disinto_agent "$@" ;; edge) shift; disinto_edge "$@" ;; - backup) shift; disinto_backup "$@" ;; -h|--help) usage ;; *) usage ;; esac diff --git a/dev/AGENTS.md b/dev/AGENTS.md index 5e6f085..c64551f 100644 --- a/dev/AGENTS.md +++ b/dev/AGENTS.md @@ -1,4 +1,4 @@ - + # Dev Agent **Role**: Implement issues autonomously — write code, push branches, address diff --git a/docker/edge/entrypoint-edge.sh b/docker/edge/entrypoint-edge.sh index 83131fb..6db96b7 100755 --- a/docker/edge/entrypoint-edge.sh +++ b/docker/edge/entrypoint-edge.sh @@ -173,15 +173,11 @@ PROJECT_TOML="${PROJECT_TOML:-projects/disinto.toml}" sleep 1200 # 20 minutes done) & -# ── Load optional secrets from secrets/*.enc (#777) ──────────────────── -# Engagement collection (collect-engagement.sh) requires CADDY_ secrets to -# SCP access logs from a remote edge host. When age key or secrets dir is -# missing, or any secret fails to decrypt, log a warning and skip the cron. -# Caddy itself does not depend on these secrets. +# ── Load required secrets from secrets/*.enc (#777) ──────────────────── +# Edge container declares its required secrets; missing ones cause a hard fail. _AGE_KEY_FILE="${HOME}/.config/sops/age/keys.txt" _SECRETS_DIR="/opt/disinto/secrets" EDGE_REQUIRED_SECRETS="CADDY_SSH_KEY CADDY_SSH_HOST CADDY_SSH_USER CADDY_ACCESS_LOG" -EDGE_ENGAGEMENT_READY=0 # Assume not ready until proven otherwise _edge_decrypt_secret() { local enc_path="${_SECRETS_DIR}/${1}.enc" @@ -196,53 +192,47 @@ if [ -f "$_AGE_KEY_FILE" ] && [ -d "$_SECRETS_DIR" ]; then export "$_secret_name=$_val" done if [ -n "$_missing" ]; then - echo "WARN: required engagement secrets missing from secrets/*.enc:${_missing}" >&2 - echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 - EDGE_ENGAGEMENT_READY=0 - else - echo "edge: loaded required engagement secrets: ${EDGE_REQUIRED_SECRETS}" >&2 - EDGE_ENGAGEMENT_READY=1 + echo "FATAL: required secrets missing from secrets/*.enc:${_missing}" >&2 + echo " Run 'disinto secrets add ' for each missing secret." >&2 + echo " If migrating from .env.vault.enc, run 'disinto secrets migrate-from-vault' first." >&2 + exit 1 fi + echo "edge: loaded required secrets: ${EDGE_REQUIRED_SECRETS}" >&2 else - echo "WARN: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — engagement secrets unavailable" >&2 - echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 - EDGE_ENGAGEMENT_READY=0 + echo "FATAL: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — cannot load required secrets" >&2 + echo " Ensure age is installed and secrets/*.enc files are present." >&2 + exit 1 fi # Start daily engagement collection cron loop in background (#745) # Runs collect-engagement.sh daily at ~23:50 UTC via a sleep loop that # calculates seconds until the next 23:50 window. SSH key from secrets/*.enc (#777). -# Guarded: only start if EDGE_ENGAGEMENT_READY=1. -if [ "$EDGE_ENGAGEMENT_READY" -eq 1 ]; then - (while true; do - # Calculate seconds until next 23:50 UTC - _now=$(date -u +%s) - _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) - if [ "$_target" -le "$_now" ]; then - _target=$(( _target + 86400 )) - fi - _sleep_secs=$(( _target - _now )) - echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 - sleep "$_sleep_secs" - _fetch_log="/tmp/caddy-access-log-fetch.log" - _ssh_key_file=$(mktemp) - printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" - chmod 0600 "$_ssh_key_file" - scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ - "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ - "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true - rm -f "$_ssh_key_file" - if [ -s "$_fetch_log" ]; then - CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ - | tee -a /opt/disinto-logs/collect-engagement.log || true - else - echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 - fi - rm -f "$_fetch_log" - done) & -else - echo "edge: collect-engagement cron skipped (EDGE_ENGAGEMENT_READY=0)" >&2 -fi +(while true; do + # Calculate seconds until next 23:50 UTC + _now=$(date -u +%s) + _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) + if [ "$_target" -le "$_now" ]; then + _target=$(( _target + 86400 )) + fi + _sleep_secs=$(( _target - _now )) + echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 + sleep "$_sleep_secs" + _fetch_log="/tmp/caddy-access-log-fetch.log" + _ssh_key_file=$(mktemp) + printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" + chmod 0600 "$_ssh_key_file" + scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ + "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ + "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true + rm -f "$_ssh_key_file" + if [ -s "$_fetch_log" ]; then + CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ + | tee -a /opt/disinto-logs/collect-engagement.log || true + else + echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 + fi + rm -f "$_fetch_log" +done) & # Nomad template renders Caddyfile to /local/Caddyfile via service discovery; # copy it into the expected location if present (compose uses the mounted path). diff --git a/docs/nomad-cutover-runbook.md b/docs/nomad-cutover-runbook.md deleted file mode 100644 index e0956cc..0000000 --- a/docs/nomad-cutover-runbook.md +++ /dev/null @@ -1,183 +0,0 @@ -# Nomad Cutover Runbook - -End-to-end procedure to cut over the disinto factory from docker-compose on -disinto-dev-box to Nomad on disinto-nomad-box. - -**Target**: disinto-nomad-box (10.10.10.216) becomes production; disinto-dev-box -stays warm for rollback. - -**Downtime budget**: <5 min blue-green flip. - -**Data scope**: Forgejo issues + disinto-ops git bundle only. Everything else is -regenerated or discarded. OAuth secrets are regenerated on fresh init (all -sessions invalidated). - ---- - -## 1. Pre-cutover readiness checklist - -- [ ] Nomad + Vault stack healthy on a fresh wipe+init (step 5 verified) -- [ ] Codeberg mirror current — `git log` parity between dev-box Forgejo and - Codeberg -- [ ] SSH key pair generated for nomad-box, registered on DO edge (see §4.6) -- [ ] Companion tools landed: - - `disinto backup create` (#1057) - - `disinto backup import` (#1058) -- [ ] Backup tarball produced and tested against a scratch LXC (see §3) - ---- - -## 2. Pre-cutover artifact: backup - -On disinto-dev-box: - -```bash -./bin/disinto backup create /tmp/disinto-backup-$(date +%Y%m%d).tar.gz -``` - -Copy the tarball to nomad-box (and optionally to a local workstation for -safekeeping): - -```bash -scp /tmp/disinto-backup-*.tar.gz nomad-box:/tmp/ -``` - ---- - -## 3. Pre-cutover dry-run - -On a throwaway LXC: - -```bash -lxc launch ubuntu:24.04 cutover-dryrun -# inside the container: -disinto init --backend=nomad --import-env .env --with edge -./bin/disinto backup import /tmp/disinto-backup-*.tar.gz -``` - -Verify: - -- Issue count matches source Forgejo -- disinto-ops repo refs match source bundle - -Destroy the LXC once satisfied: - -```bash -lxc delete cutover-dryrun --force -``` - ---- - -## 4. Cutover T-0 (operator executes; <5 min target) - -### 4.1 Stop dev-box services - -```bash -# On disinto-dev-box — stop, do NOT remove volumes (rollback needs them) -docker-compose stop -``` - -### 4.2 Provision nomad-box (if not already done) - -```bash -# On disinto-nomad-box -disinto init --backend=nomad --import-env .env --with edge -``` - -### 4.3 Import backup - -```bash -# On disinto-nomad-box -./bin/disinto backup import /tmp/disinto-backup-*.tar.gz -``` - -### 4.4 Configure Codeberg pull mirror - -Manual, one-time step in the new Forgejo UI: - -1. Create a mirror repository pointing at the Codeberg upstream -2. Confirm initial sync completes - -### 4.5 Claude login - -```bash -# On disinto-nomad-box -claude login -``` - -Set up Anthropic OAuth so agents can authenticate. - -### 4.6 Autossh tunnel swap - -> **Operator step** — cross-host, no dev-agent involvement. Do NOT automate. - -1. Stop the tunnel on dev-box: - ```bash - # On disinto-dev-box - systemctl stop reverse-tunnel - ``` - -2. Copy or regenerate the tunnel unit on nomad-box: - ```bash - # Copy from dev-box, or let init regenerate it - scp dev-box:/etc/systemd/system/reverse-tunnel.service \ - nomad-box:/etc/systemd/system/ - ``` - -3. Register nomad-box's public key on DO edge: - ```bash - # On DO edge box — same restricted-command as the dev-box key - echo "" >> /home/johba/.ssh/authorized_keys - ``` - -4. Start the tunnel on nomad-box: - ```bash - # On disinto-nomad-box - systemctl enable --now reverse-tunnel - ``` - -5. Verify end-to-end: - ```bash - curl https://self.disinto.ai/api/v1/version - # Should return the new box's Forgejo version - ``` - ---- - -## 5. Post-cutover smoke - -- [ ] `curl https://self.disinto.ai` → Forgejo welcome page -- [ ] Create a test PR → Woodpecker pipeline runs → agents assign and work -- [ ] Claude chat login via Forgejo OAuth succeeds - ---- - -## 6. Rollback (if any step 4 gate fails) - -1. Stop the tunnel on nomad-box: - ```bash - systemctl stop reverse-tunnel # on nomad-box - ``` - -2. Restore the tunnel on dev-box: - ```bash - systemctl start reverse-tunnel # on dev-box - ``` - -3. Bring dev-box services back up: - ```bash - docker-compose up -d # on dev-box - ``` - -4. DO Caddy config is unchanged — traffic restores in <5 min. - -5. File a post-mortem issue. Keep nomad-box state intact for debugging. - ---- - -## 7. Post-stable cleanup (T+1 week) - -- `docker-compose down -v` on dev-box -- Archive `/var/lib/docker/volumes/disinto_*` to cold storage -- Delete disinto-dev-box LXC or keep as permanent rollback reserve (operator - decision) diff --git a/gardener/AGENTS.md b/gardener/AGENTS.md index 63544c5..5dcd12f 100644 --- a/gardener/AGENTS.md +++ b/gardener/AGENTS.md @@ -1,4 +1,4 @@ - + # Gardener Agent **Role**: Backlog grooming — detect duplicate issues, missing acceptance diff --git a/gardener/dust.jsonl b/gardener/dust.jsonl index e69de29..09af349 100644 --- a/gardener/dust.jsonl +++ b/gardener/dust.jsonl @@ -0,0 +1 @@ +{"issue":850,"group":"lib/generators.sh","title":"compose dup-detection smoke CI failures","reason":"4+ consecutive ci_exhausted failures across PRs #872 #908 #971; planner flagged for human re-scope","ts":"2026-04-19T00:00:00Z"} diff --git a/gardener/pending-actions.json b/gardener/pending-actions.json index 5e481fa..9827786 100644 --- a/gardener/pending-actions.json +++ b/gardener/pending-actions.json @@ -1,23 +1,8 @@ [ { - "action": "add_label", - "issue": 1047, - "label": "backlog" - }, - { - "action": "add_label", - "issue": 1047, - "label": "priority" - }, - { - "action": "add_label", - "issue": 1044, - "label": "backlog" - }, - { - "action": "remove_label", + "action": "edit_body", "issue": 1025, - "label": "blocked" + "body": "## Goal\nVerify that Forgejo, Woodpecker, and chat all function correctly when served\nunder /forge/, /ci/, and /chat/ subpaths on a single domain. Catch redirect\nloops, OAuth callback failures, and asset 404s before they hit production.\n\n## Sprint\nPart of sprint [edge-subpath-chat](https://forgejo:3000/disinto-admin/disinto-ops/pulls/37) — vision issue #623.\n\n## Acceptance criteria\n- [ ] Forgejo login at /forge/ completes without redirect loops\n- [ ] Forgejo OAuth callback for Woodpecker succeeds under subpath\n- [ ] Woodpecker dashboard loads all assets at /ci/ (no 404s on JS/CSS)\n- [ ] Chat OAuth login flow works at /chat/login\n- [ ] Forward_auth on /chat/* rejects unauthenticated requests with 401\n- [ ] Staging content loads at /staging/\n- [ ] Root / redirects to /forge/\n- [ ] CI pipeline added to .woodpecker/ to run this test on edge-related changes\n\n## Affected files\n- `nomad/jobs/edge.hcl` — edge Caddy routing config under test\n- `docker/edge/` — edge container and Caddyfile template\n- `tools/edge-control/register.sh` — route registration\n- `.woodpecker/` — CI pipeline for edge smoke test\n\n## Dependencies\nNone — first issue in sprint.\n" }, { "action": "add_label", @@ -25,23 +10,33 @@ "label": "backlog" }, { - "action": "comment", - "issue": 1025, - "body": "Gardener: removing `blocked` — fix path is well-defined (Option 1: static-checks-only pipeline). Promoting to backlog for next dev pick-up. Dev must follow the acceptance criteria literally — no live service curls, static checks only." - }, - { - "action": "remove_label", - "issue": 850, - "label": "blocked" + "action": "edit_body", + "issue": 1026, + "body": "## Goal\nReplace the blocking one-shot claude --print invocation in the chat backend with\na WebSocket connection that streams tokens to the UI as they arrive.\n\n## Sprint\nPart of sprint [edge-subpath-chat](https://forgejo:3000/disinto-admin/disinto-ops/pulls/37) — vision issue #623.\n\n## Acceptance criteria\n- [ ] /chat/ws endpoint accepts WebSocket upgrade with valid session cookie\n- [ ] /chat/ws rejects upgrade if session cookie is missing or expired\n- [ ] Chat backend streams claude output over WebSocket as text frames\n- [ ] UI renders tokens incrementally as they arrive\n- [ ] Rate limiting still enforced on WebSocket messages\n- [ ] Caddy proxies WebSocket upgrade correctly through /chat/ws with forward_auth\n\n## Affected files\n- `docker/chat/server.py` — chat backend WebSocket endpoint\n- `docker/chat/ui/` — frontend WebSocket client rendering\n- `nomad/jobs/edge.hcl` — Caddy WebSocket proxy config\n- `nomad/jobs/chat.hcl` — chat Nomad job\n\n## Dependencies\n- Depends on #1025 — subpath routing smoke test\n" }, { "action": "add_label", - "issue": 850, + "issue": 1026, "label": "backlog" }, { - "action": "comment", - "issue": 850, - "body": "Gardener: removing `blocked` — 5th attempt recipe is at the top of this issue. Dev must follow the recipe exactly (call `_generate_compose_impl` directly in isolated FACTORY_ROOT, do NOT use `bin/disinto init`). Do not copy patterns from prior PRs." + "action": "edit_body", + "issue": 1027, + "body": "## Goal\nGive the chat container Claude session read-write access to the project working\ntree so the operator can inspect, explain, or modify code — scoped to that tree\nonly, with no access to factory internals, secrets, or Docker socket.\n\n## Sprint\nPart of sprint [edge-subpath-chat](https://forgejo:3000/disinto-admin/disinto-ops/pulls/37) — vision issue #623.\n\n## Acceptance criteria\n- [ ] Chat container bind-mounts the project working tree as a named volume\n- [ ] Claude invocation in server.py sets cwd to the workspace directory\n- [ ] Claude permission mode is acceptEdits (not bypassPermissions)\n- [ ] verify-chat-sandbox.sh updated to assert workspace mount exists\n- [ ] Compose generator adds the workspace volume conditionally\n\n## Affected files\n- `docker/chat/server.py` — Claude invocation and cwd setup\n- `tools/edge-control/verify-chat-sandbox.sh` — sandbox verification\n- `lib/generators.sh` — Compose generator workspace volume\n- `nomad/jobs/chat.hcl` — chat container bind-mount config\n\n## Dependencies\n- Depends on #1025 — subpath routing smoke test\n" + }, + { + "action": "add_label", + "issue": 1027, + "label": "backlog" + }, + { + "action": "edit_body", + "issue": 1028, + "body": "## Goal\nIf the smoke test reveals unfixable subpath issues, automate the pivot to\nper-service subdomains so the switch is a single config change.\n\n## Sprint\nPart of sprint [edge-subpath-chat](https://forgejo:3000/disinto-admin/disinto-ops/pulls/37) — vision issue #623.\n\n## Acceptance criteria\n- [ ] generators.sh _generate_caddyfile_impl accepts EDGE_ROUTING_MODE env var\n- [ ] In subdomain mode, Caddyfile emits four host blocks per edge-routing-fallback.md\n- [ ] register.sh registers additional subdomain routes when EDGE_ROUTING_MODE=subdomain\n- [ ] OAuth redirect URIs in ci-setup.sh respect routing mode\n- [ ] .env template documents EDGE_ROUTING_MODE with a comment referencing the fallback doc\n\n## Affected files\n- `lib/generators.sh` — _generate_caddyfile_impl routing mode switch\n- `tools/edge-control/register.sh` — subdomain route registration\n- `lib/ci-setup.sh` — OAuth redirect URI handling\n- `projects/*.toml.example` — .env template documentation\n\n## Dependencies\n- Depends on #1025 — subpath routing smoke test\n" + }, + { + "action": "add_label", + "issue": 1028, + "label": "backlog" } ] diff --git a/lib/AGENTS.md b/lib/AGENTS.md index feaee18..09f18b1 100644 --- a/lib/AGENTS.md +++ b/lib/AGENTS.md @@ -1,4 +1,4 @@ - + # Shared Helpers (`lib/`) All agents source `lib/env.sh` as their first action. Additional helpers are @@ -35,4 +35,4 @@ sourced as needed. | `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) | | `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) | | `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_get_or_empty(PATH)` — GET /v1/PATH; 200→raw body, 404→empty, else structured error + return 1 (used by sync scripts to distinguish "absent, create" from hard failure without tripping errexit, #881). `hvault_ensure_kv_v2(MOUNT, [LOG_PREFIX])` — idempotent KV v2 mount assertion: enables mount if absent, fails loudly if present as wrong type/version. Extracted from all `vault-seed-*.sh` scripts to eliminate dup-detector violations. Respects `DRY_RUN=1`. `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. `_hvault_seed_key(PATH, KEY, [GENERATOR])` — seed one KV key if absent; reads existing data and merges to preserve sibling keys (KV v2 replaces atomically); returns 0=created, 1=unchanged, 2=API error (#992). All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | `tools/vault-apply-policies.sh`, `tools/vault-apply-roles.sh`, `lib/init/nomad/vault-nomad-auth.sh`, `tools/vault-seed-*.sh` | -| `lib/init/nomad/` | Nomad+Vault installer scripts. `cluster-up.sh` — idempotent Step-0 orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling; `HOST_VOLUME_DIRS` array now includes `/srv/disinto/docker` (for staging file-server, S5.2, #989, #992). `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. `vault-nomad-auth.sh` — Step-2 script that enables Vault's JWT auth at path `jwt-nomad`, writes the JWKS/algs config pointing at Nomad's workload-identity signer, delegates role sync to `tools/vault-apply-roles.sh`, installs `/etc/nomad.d/server.hcl`, and SIGHUPs `nomad.service` if the file changed (#881). `wp-oauth-register.sh` — S3.3 script that creates the Woodpecker OAuth2 app in Forgejo and stores `forgejo_client`/`forgejo_secret` in Vault KV v2 at `kv/disinto/shared/woodpecker`; idempotent (skips if app or secrets already present); called by `bin/disinto --with woodpecker`. `deploy.sh` — S4 dependency-ordered Nomad job deploy + health-wait; takes a list of jobspec basenames, submits each to Nomad and polls until healthy before proceeding to the next; supports `--dry-run` and per-job timeout overrides via `JOB_READY_TIMEOUT_`; global default timeout `JOB_READY_TIMEOUT_SECS` is 360s (raised from 240s for chat cold-start, #1036); invoked by `bin/disinto --with ` and `cluster-up.sh`; deploy order now covers staging, chat, edge (S5.5, #992). Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` | +| `lib/init/nomad/` | Nomad+Vault installer scripts. `cluster-up.sh` — idempotent Step-0 orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling; `HOST_VOLUME_DIRS` array now includes `/srv/disinto/docker` (for staging file-server, S5.2, #989, #992). `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. `vault-nomad-auth.sh` — Step-2 script that enables Vault's JWT auth at path `jwt-nomad`, writes the JWKS/algs config pointing at Nomad's workload-identity signer, delegates role sync to `tools/vault-apply-roles.sh`, installs `/etc/nomad.d/server.hcl`, and SIGHUPs `nomad.service` if the file changed (#881). `wp-oauth-register.sh` — S3.3 script that creates the Woodpecker OAuth2 app in Forgejo and stores `forgejo_client`/`forgejo_secret` in Vault KV v2 at `kv/disinto/shared/woodpecker`; idempotent (skips if app or secrets already present); called by `bin/disinto --with woodpecker`. `deploy.sh` — S4 dependency-ordered Nomad job deploy + health-wait; takes a list of jobspec basenames, submits each to Nomad and polls until healthy before proceeding to the next; supports `--dry-run` and per-job timeout overrides via `JOB_READY_TIMEOUT_`; invoked by `bin/disinto --with ` and `cluster-up.sh`; deploy order now covers staging, chat, edge (S5.5, #992). Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` | diff --git a/lib/agent-sdk.sh b/lib/agent-sdk.sh index b968222..2522655 100644 --- a/lib/agent-sdk.sh +++ b/lib/agent-sdk.sh @@ -52,9 +52,8 @@ claude_run_with_watchdog() { out_file=$(mktemp) || return 1 trap 'rm -f "$out_file"' RETURN - # Start claude in new process group (setsid creates new session, $pid is PGID leader) - # All children of claude will inherit this process group - setsid "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" & + # Start claude in background, capturing stdout to temp file + "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" & pid=$! # Background watchdog: poll for final result marker @@ -85,12 +84,12 @@ claude_run_with_watchdog() { sleep "$grace" if kill -0 "$pid" 2>/dev/null; then log "watchdog: claude -p idle for ${grace}s after final result; SIGTERM" - kill -TERM -- "-$pid" 2>/dev/null || true + kill -TERM "$pid" 2>/dev/null || true # Give it a moment to clean up sleep 5 if kill -0 "$pid" 2>/dev/null; then log "watchdog: force kill after SIGTERM timeout" - kill -KILL -- "-$pid" 2>/dev/null || true + kill -KILL "$pid" 2>/dev/null || true fi fi fi @@ -101,16 +100,16 @@ claude_run_with_watchdog() { timeout --foreground "${CLAUDE_TIMEOUT:-7200}" tail --pid="$pid" -f /dev/null 2>/dev/null rc=$? - # Clean up the watchdog (target process group if it spawned children) - kill -- "-$grace_pid" 2>/dev/null || true + # Clean up the watchdog + kill "$grace_pid" 2>/dev/null || true wait "$grace_pid" 2>/dev/null || true - # When timeout fires (rc=124), explicitly kill the orphaned claude process group + # When timeout fires (rc=124), explicitly kill the orphaned claude process # tail --pid is a passive waiter, not a supervisor if [ "$rc" -eq 124 ]; then - kill -TERM -- "-$pid" 2>/dev/null || true + kill "$pid" 2>/dev/null || true sleep 1 - kill -KILL -- "-$pid" 2>/dev/null || true + kill -KILL "$pid" 2>/dev/null || true fi # Output the captured stdout diff --git a/lib/backup.sh b/lib/backup.sh deleted file mode 100644 index 8d7a827..0000000 --- a/lib/backup.sh +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# disinto backup — export factory state for migration -# -# Usage: source this file, then call backup_create -# Requires: FORGE_URL, FORGE_TOKEN, FORGE_REPO, FORGE_OPS_REPO, OPS_REPO_ROOT -# ============================================================================= -set -euo pipefail - -# Fetch all issues (open + closed) for a repo slug and emit the normalized JSON array. -# Usage: _backup_fetch_issues -_backup_fetch_issues() { - local repo_slug="$1" - local api_url="${FORGE_API_BASE}/repos/${repo_slug}" - - local all_issues="[]" - for state in open closed; do - local page=1 - while true; do - local page_items - page_items=$(curl -sf -X GET \ - -H "Authorization: token ${FORGE_TOKEN}" \ - -H "Content-Type: application/json" \ - "${api_url}/issues?state=${state}&type=issues&limit=50&page=${page}") || { - echo "ERROR: failed to fetch ${state} issues from ${repo_slug} (page ${page})" >&2 - return 1 - } - local count - count=$(printf '%s' "$page_items" | jq 'length' 2>/dev/null) || count=0 - [ -z "$count" ] && count=0 - [ "$count" -eq 0 ] && break - all_issues=$(printf '%s\n%s' "$all_issues" "$page_items" | jq -s 'add') - [ "$count" -lt 50 ] && break - page=$((page + 1)) - done - done - - # Normalize to the schema: number, title, body, labels, state - printf '%s' "$all_issues" | jq '[.[] | { - number: .number, - title: .title, - body: .body, - labels: [.labels[]?.name], - state: .state - }] | sort_by(.number)' -} - -# Create a backup tarball of factory state. -# Usage: backup_create -backup_create() { - local outfile="${1:-}" - if [ -z "$outfile" ]; then - echo "Error: output file required" >&2 - echo "Usage: disinto backup create " >&2 - return 1 - fi - - # Resolve to absolute path before cd-ing into tmpdir - case "$outfile" in - /*) ;; - *) outfile="$(pwd)/${outfile}" ;; - esac - - # Validate required env - : "${FORGE_URL:?FORGE_URL must be set}" - : "${FORGE_TOKEN:?FORGE_TOKEN must be set}" - : "${FORGE_REPO:?FORGE_REPO must be set}" - - local forge_ops_repo="${FORGE_OPS_REPO:-${FORGE_REPO}-ops}" - local ops_repo_root="${OPS_REPO_ROOT:-}" - - if [ -z "$ops_repo_root" ] || [ ! -d "$ops_repo_root/.git" ]; then - echo "Error: OPS_REPO_ROOT (${ops_repo_root:-}) is not a valid git repo" >&2 - return 1 - fi - - local tmpdir - tmpdir=$(mktemp -d) - trap 'rm -rf "$tmpdir"' EXIT - - local project_name="${FORGE_REPO##*/}" - - echo "=== disinto backup create ===" - echo "Forge: ${FORGE_URL}" - echo "Repos: ${FORGE_REPO}, ${forge_ops_repo}" - - # ── 1. Export issues ────────────────────────────────────────────────────── - mkdir -p "${tmpdir}/issues" - - echo "Fetching issues for ${FORGE_REPO}..." - _backup_fetch_issues "$FORGE_REPO" > "${tmpdir}/issues/${project_name}.json" - local main_count - main_count=$(jq 'length' "${tmpdir}/issues/${project_name}.json") - echo " ${main_count} issues exported" - - echo "Fetching issues for ${forge_ops_repo}..." - _backup_fetch_issues "$forge_ops_repo" > "${tmpdir}/issues/${project_name}-ops.json" - local ops_count - ops_count=$(jq 'length' "${tmpdir}/issues/${project_name}-ops.json") - echo " ${ops_count} issues exported" - - # ── 2. Git bundle of ops repo ──────────────────────────────────────────── - mkdir -p "${tmpdir}/repos" - - echo "Creating git bundle for ${forge_ops_repo}..." - git -C "$ops_repo_root" bundle create "${tmpdir}/repos/${project_name}-ops.bundle" --all 2>&1 - echo " bundle created ($(du -h "${tmpdir}/repos/${project_name}-ops.bundle" | cut -f1))" - - # ── 3. Metadata ────────────────────────────────────────────────────────── - local created_at - created_at=$(date -u +"%Y-%m-%dT%H:%M:%SZ") - - jq -n \ - --arg created_at "$created_at" \ - --arg source_host "$(hostname)" \ - --argjson schema_version 1 \ - --arg forgejo_url "$FORGE_URL" \ - '{ - created_at: $created_at, - source_host: $source_host, - schema_version: $schema_version, - forgejo_url: $forgejo_url - }' > "${tmpdir}/metadata.json" - - # ── 4. Pack tarball ────────────────────────────────────────────────────── - echo "Creating tarball: ${outfile}" - tar -czf "$outfile" -C "$tmpdir" metadata.json issues repos - local size - size=$(du -h "$outfile" | cut -f1) - echo "=== Backup complete: ${outfile} (${size}) ===" - - # Clean up before returning — the EXIT trap references the local $tmpdir - # which goes out of scope after return, causing 'unbound variable' under set -u. - trap - EXIT - rm -rf "$tmpdir" -} diff --git a/lib/ci-helpers.sh b/lib/ci-helpers.sh index 6afe97b..11c668e 100644 --- a/lib/ci-helpers.sh +++ b/lib/ci-helpers.sh @@ -247,31 +247,6 @@ ci_promote() { echo "$new_num" } -# ci_get_step_logs -# Fetches logs for a single CI step via the Woodpecker API. -# Requires: WOODPECKER_REPO_ID, woodpecker_api() (from env.sh) -# Returns: 0 on success, 1 on failure. Outputs log text to stdout. -# -# Usage: -# ci_get_step_logs 1423 5 # Get logs for step ID 5 in pipeline 1423 -ci_get_step_logs() { - local pipeline_num="$1" step_id="$2" - - if [ -z "$pipeline_num" ] || [ -z "$step_id" ]; then - echo "Usage: ci_get_step_logs " >&2 - return 1 - fi - - if [ -z "${WOODPECKER_REPO_ID:-}" ] || [ "${WOODPECKER_REPO_ID}" = "0" ]; then - echo "ERROR: WOODPECKER_REPO_ID not set or zero" >&2 - return 1 - fi - - woodpecker_api "/repos/${WOODPECKER_REPO_ID}/logs/${pipeline_num}/${step_id}" \ - --max-time 15 2>/dev/null \ - | jq -r '.[].data // empty' 2>/dev/null -} - # ci_get_logs [--step ] # Reads CI logs from the Woodpecker SQLite database. # Requires: WOODPECKER_DATA_DIR env var or mounted volume at /woodpecker-data diff --git a/lib/generators.sh b/lib/generators.sh index eb223e8..77af9a7 100644 --- a/lib/generators.sh +++ b/lib/generators.sh @@ -26,28 +26,6 @@ PROJECT_NAME="${PROJECT_NAME:-project}" # PRIMARY_BRANCH defaults to main (env.sh may have set it to 'master') PRIMARY_BRANCH="${PRIMARY_BRANCH:-main}" -# Track service names for duplicate detection -declare -A _seen_services -declare -A _service_sources - -# Record a service name and its source; return 0 if unique, 1 if duplicate -_record_service() { - local service_name="$1" - local source="$2" - - if [ -n "${_seen_services[$service_name]:-}" ]; then - local original_source="${_service_sources[$service_name]}" - echo "ERROR: Duplicate service name '$service_name' detected —" >&2 - echo " '$service_name' emitted twice — from $original_source and from $source" >&2 - echo " Remove one of the conflicting activations to proceed." >&2 - return 1 - fi - - _seen_services[$service_name]=1 - _service_sources[$service_name]="$source" - return 0 -} - # Helper: extract woodpecker_repo_id from a project TOML file # Returns empty string if not found or file doesn't exist _get_woodpecker_repo_id() { @@ -119,16 +97,6 @@ _generate_local_model_services() { POLL_INTERVAL) poll_interval_val="$value" ;; ---) if [ -n "$service_name" ] && [ -n "$base_url" ]; then - # Record service for duplicate detection using the full service name - local full_service_name="agents-${service_name}" - local toml_basename - toml_basename=$(basename "$toml") - if ! _record_service "$full_service_name" "[agents.$service_name] in projects/$toml_basename"; then - # Duplicate detected — clean up and abort - rm -f "$temp_file" - return 1 - fi - # Per-agent FORGE_TOKEN / FORGE_PASS lookup (#834 Gap 3). # Two hired llama agents must not share the same Forgejo identity, # so we key the env-var lookup by forge_user (which hire-agent.sh @@ -313,21 +281,6 @@ _generate_compose_impl() { return 0 fi - # Reset duplicate detection state for fresh run - _seen_services=() - _service_sources=() - - # Initialize duplicate detection with base services defined in the template - _record_service "forgejo" "base compose template" || return 1 - _record_service "woodpecker" "base compose template" || return 1 - _record_service "woodpecker-agent" "base compose template" || return 1 - _record_service "agents" "base compose template" || return 1 - _record_service "runner" "base compose template" || return 1 - _record_service "edge" "base compose template" || return 1 - _record_service "staging" "base compose template" || return 1 - _record_service "staging-deploy" "base compose template" || return 1 - _record_service "chat" "base compose template" || return 1 - # Extract primary woodpecker_repo_id from project TOML files local wp_repo_id wp_repo_id=$(_get_primary_woodpecker_repo_id) @@ -405,9 +358,6 @@ services: WOODPECKER_SERVER: localhost:9000 WOODPECKER_AGENT_SECRET: ${WOODPECKER_AGENT_SECRET:-} WOODPECKER_GRPC_SECURE: "false" - WOODPECKER_GRPC_KEEPALIVE_TIME: "10s" - WOODPECKER_GRPC_KEEPALIVE_TIMEOUT: "20s" - WOODPECKER_GRPC_KEEPALIVE_PERMIT_WITHOUT_CALLS: "true" WOODPECKER_HEALTHCHECK_ADDR: ":3333" WOODPECKER_BACKEND_DOCKER_NETWORK: ${WOODPECKER_CI_NETWORK:-disinto_disinto-net} WOODPECKER_MAX_WORKFLOWS: 1 @@ -486,76 +436,6 @@ services: COMPOSEEOF - # ── Conditional agents-llama block (ENABLE_LLAMA_AGENT=1) ────────────── - # This legacy flag was removed in #846 but kept for duplicate detection testing - if [ "${ENABLE_LLAMA_AGENT:-0}" = "1" ]; then - if ! _record_service "agents-llama" "ENABLE_LLAMA_AGENT=1"; then - return 1 - fi - cat >> "$compose_file" <<'COMPOSEEOF' - - agents-llama: - image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest} - container_name: disinto-agents-llama - restart: unless-stopped - security_opt: - - apparmor=unconfined - volumes: - - agent-data:/home/agent/data - - project-repos:/home/agent/repos - - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro - - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - - woodpecker-data:/woodpecker-data:ro - - ./projects:/home/agent/disinto/projects:ro - - ./.env:/home/agent/disinto/.env:ro - - ./state:/home/agent/disinto/state - environment: - FORGE_URL: http://forgejo:3000 - FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto} - FORGE_TOKEN: ${FORGE_TOKEN:-} - FORGE_REVIEW_TOKEN: ${FORGE_REVIEW_TOKEN:-} - FORGE_PLANNER_TOKEN: ${FORGE_PLANNER_TOKEN:-} - FORGE_GARDENER_TOKEN: ${FORGE_GARDENER_TOKEN:-} - FORGE_VAULT_TOKEN: ${FORGE_VAULT_TOKEN:-} - FORGE_SUPERVISOR_TOKEN: ${FORGE_SUPERVISOR_TOKEN:-} - FORGE_PREDICTOR_TOKEN: ${FORGE_PREDICTOR_TOKEN:-} - FORGE_ARCHITECT_TOKEN: ${FORGE_ARCHITECT_TOKEN:-} - FORGE_BOT_USERNAMES: ${FORGE_BOT_USERNAMES:-} - WOODPECKER_TOKEN: ${WOODPECKER_TOKEN:-} - CLAUDE_TIMEOUT: ${CLAUDE_TIMEOUT:-7200} - CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: ${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1} - ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-} - FORGE_PASS: ${FORGE_PASS:-} - FORGE_ADMIN_PASS: ${FORGE_ADMIN_PASS:-} - FACTORY_REPO: ${FORGE_REPO:-disinto-admin/disinto} - DISINTO_CONTAINER: "1" - PROJECT_NAME: ${PROJECT_NAME:-project} - PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project} - WOODPECKER_DATA_DIR: /woodpecker-data - WOODPECKER_REPO_ID: "PLACEHOLDER_WP_REPO_ID" - CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config} - POLL_INTERVAL: ${POLL_INTERVAL:-300} - GARDENER_INTERVAL: ${GARDENER_INTERVAL:-21600} - ARCHITECT_INTERVAL: ${ARCHITECT_INTERVAL:-21600} - PLANNER_INTERVAL: ${PLANNER_INTERVAL:-43200} - healthcheck: - test: ["CMD", "pgrep", "-f", "entrypoint.sh"] - interval: 60s - timeout: 5s - retries: 3 - start_period: 30s - depends_on: - forgejo: - condition: service_healthy - woodpecker: - condition: service_started - networks: - - disinto-net - -COMPOSEEOF - fi - # Resume the rest of the compose file (runner onward) cat >> "$compose_file" <<'COMPOSEEOF' @@ -751,10 +631,7 @@ COMPOSEEOF fi # Append local-model agent services if any are configured - if ! _generate_local_model_services "$compose_file"; then - echo "ERROR: Failed to generate local-model agent services. See errors above." >&2 - return 1 - fi + _generate_local_model_services "$compose_file" # Resolve the Claude CLI binary path and persist as CLAUDE_BIN_DIR in .env. # Only used by reproduce and edge services which still use host-mounted CLI. diff --git a/lib/init/nomad/deploy.sh b/lib/init/nomad/deploy.sh index f9a3805..7cf9278 100755 --- a/lib/init/nomad/deploy.sh +++ b/lib/init/nomad/deploy.sh @@ -16,7 +16,7 @@ # Environment: # REPO_ROOT — absolute path to repo root (defaults to parent of # this script's parent directory) -# JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 360) +# JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 240) # JOB_READY_TIMEOUT_ — per-job timeout override (e.g., # JOB_READY_TIMEOUT_FORGEJO=300) # @@ -33,7 +33,7 @@ set -euo pipefail # ── Configuration ──────────────────────────────────────────────────────────── SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}" -JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-360}" +JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-240}" DRY_RUN=0 diff --git a/lib/issue-lifecycle.sh b/lib/issue-lifecycle.sh index 25f2c6b..1ad3239 100644 --- a/lib/issue-lifecycle.sh +++ b/lib/issue-lifecycle.sh @@ -157,10 +157,9 @@ issue_claim() { return 1 fi - local ip_id bl_id bk_id + local ip_id bl_id ip_id=$(_ilc_in_progress_id) bl_id=$(_ilc_backlog_id) - bk_id=$(_ilc_blocked_id) if [ -n "$ip_id" ]; then curl -sf -X POST \ -H "Authorization: token ${FORGE_TOKEN}" \ @@ -173,12 +172,6 @@ issue_claim() { -H "Authorization: token ${FORGE_TOKEN}" \ "${FORGE_API}/issues/${issue}/labels/${bl_id}" >/dev/null 2>&1 || true fi - # Clear blocked label on re-claim — starting work is implicit resolution of prior block - if [ -n "$bk_id" ]; then - curl -sf -X DELETE \ - -H "Authorization: token ${FORGE_TOKEN}" \ - "${FORGE_API}/issues/${issue}/labels/${bk_id}" >/dev/null 2>&1 || true - fi _ilc_log "claimed issue #${issue}" return 0 } diff --git a/lib/pr-lifecycle.sh b/lib/pr-lifecycle.sh index bca08f1..e097f34 100644 --- a/lib/pr-lifecycle.sh +++ b/lib/pr-lifecycle.sh @@ -429,100 +429,19 @@ pr_walk_to_merge() { _prl_log "CI failed — invoking agent (attempt ${ci_fix_count}/${max_ci_fixes})" - # Build per-workflow/per-step CI diagnostics prompt - local ci_prompt_body="" - local passing_workflows="" - local built_diagnostics=false - - if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${WOODPECKER_REPO_ID:-}" ]; then - local pip_json - pip_json=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_PR_CI_PIPELINE}" 2>/dev/null) || pip_json="" - - if [ -n "$pip_json" ]; then - local wf_count - wf_count=$(printf '%s' "$pip_json" | jq '[.workflows[]?] | length' 2>/dev/null) || wf_count=0 - - if [ "$wf_count" -gt 0 ]; then - built_diagnostics=true - local wf_idx=0 - while [ "$wf_idx" -lt "$wf_count" ]; do - local wf_name wf_state - wf_name=$(printf '%s' "$pip_json" | jq -r ".workflows[$wf_idx].name // \"workflow-$wf_idx\"" 2>/dev/null) - wf_state=$(printf '%s' "$pip_json" | jq -r ".workflows[$wf_idx].state // \"unknown\"" 2>/dev/null) - - if [ "$wf_state" = "failure" ] || [ "$wf_state" = "error" ] || [ "$wf_state" = "killed" ]; then - # Collect failed children for this workflow - local failed_children - failed_children=$(printf '%s' "$pip_json" | jq -r " - .workflows[$wf_idx].children[]? | - select(.state == \"failure\" or .state == \"error\" or .state == \"killed\") | - \"\(.name)\t\(.exit_code)\t\(.pid)\"" 2>/dev/null) || failed_children="" - - ci_prompt_body="${ci_prompt_body} ---- Failed workflow: ${wf_name} ---" - if [ -n "$failed_children" ]; then - while IFS=$'\t' read -r step_name step_exit step_pid; do - [ -z "$step_name" ] && continue - local exit_annotation="" - case "$step_exit" in - 126) exit_annotation=" (permission denied or not executable)" ;; - 127) exit_annotation=" (command not found)" ;; - 128) exit_annotation=" (invalid exit argument / signal+128)" ;; - esac - ci_prompt_body="${ci_prompt_body} - Step: ${step_name} - Exit code: ${step_exit}${exit_annotation}" - - # Fetch per-step logs - if [ -n "$step_pid" ] && [ "$step_pid" != "null" ]; then - local step_logs - step_logs=$(ci_get_step_logs "$_PR_CI_PIPELINE" "$step_pid" 2>/dev/null | tail -50) || step_logs="" - if [ -n "$step_logs" ]; then - ci_prompt_body="${ci_prompt_body} - Log tail (last 50 lines): -\`\`\` -${step_logs} -\`\`\`" - fi - fi - done <<< "$failed_children" - else - ci_prompt_body="${ci_prompt_body} - (no failed step details available)" - fi - else - # Track passing/other workflows - if [ -n "$passing_workflows" ]; then - passing_workflows="${passing_workflows}, ${wf_name}" - else - passing_workflows="${wf_name}" - fi - fi - wf_idx=$((wf_idx + 1)) - done - fi - fi + # Get CI logs from SQLite database if available + local ci_logs="" + if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${FACTORY_ROOT:-}" ]; then + ci_logs=$(ci_get_logs "$_PR_CI_PIPELINE" 2>/dev/null | tail -50) || ci_logs="" fi - # Fallback: use legacy log fetch if per-workflow diagnostics unavailable - if [ "$built_diagnostics" = false ]; then - local ci_logs="" - if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${FACTORY_ROOT:-}" ]; then - ci_logs=$(ci_get_logs "$_PR_CI_PIPELINE" 2>/dev/null | tail -50) || ci_logs="" - fi - if [ -n "$ci_logs" ]; then - ci_prompt_body=" + local logs_section="" + if [ -n "$ci_logs" ]; then + logs_section=" CI Log Output (last 50 lines): \`\`\` ${ci_logs} -\`\`\`" - fi - fi - - local passing_line="" - if [ -n "$passing_workflows" ]; then - passing_line=" -Passing workflows (do not modify): ${passing_workflows} +\`\`\` " fi @@ -531,10 +450,9 @@ Passing workflows (do not modify): ${passing_workflows} Pipeline: #${_PR_CI_PIPELINE:-?} Failure type: ${_PR_CI_FAILURE_TYPE:-unknown} -${passing_line} + Error log: -${_PR_CI_ERROR_LOG:-No logs available.} -${ci_prompt_body} +${_PR_CI_ERROR_LOG:-No logs available.}${logs_section} Fix the issue, run tests, commit, rebase on ${PRIMARY_BRANCH}, and push: git fetch ${remote} ${PRIMARY_BRANCH} && git rebase ${remote}/${PRIMARY_BRANCH} diff --git a/nomad/AGENTS.md b/nomad/AGENTS.md index 729214e..57667bc 100644 --- a/nomad/AGENTS.md +++ b/nomad/AGENTS.md @@ -1,4 +1,4 @@ - + # nomad/ — Agent Instructions Nomad + Vault HCL for the factory's single-node cluster. These files are @@ -21,7 +21,7 @@ see issues #821–#992 for the step breakdown. | `jobs/agents.hcl` | submitted via `lib/init/nomad/deploy.sh` | All 7 agent roles (dev, review, gardener, planner, predictor, supervisor, architect) + llama variant; Vault-templated bot tokens via `service-agents` policy; `force_pull = false` — image is built locally by `bin/disinto --with agents`, no registry (S4.1, S4-fix-2, S4-fix-5, #955, #972, #978) | | `jobs/staging.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy file-server mounting `docker/` as `/srv/site:ro`; no Vault integration; **dynamic host port** (no static 80 — edge owns 80/443, collision fixed in S5-fix-7 #1018); edge discovers via Nomad service registration (S5.2, #989) | | `jobs/chat.hcl` | submitted via `lib/init/nomad/deploy.sh` | Claude chat UI; custom `disinto/chat:local` image; sandbox hardening (cap_drop ALL, **tmpfs via mount block** not `tmpfs=` arg — S5-fix-5 #1012, pids_limit 128); Vault-templated OAuth secrets via `service-chat` policy (S5.2, #989) | -| `jobs/edge.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy reverse proxy + dispatcher sidecar; routes /forge, /woodpecker, /staging, /chat; uses `disinto/edge:local` image built by `bin/disinto --with edge`; **both Caddy and dispatcher tasks use `network_mode = "host"`** — upstreams are `127.0.0.1:` (forgejo :3000, woodpecker :8000, chat :8080), not Docker hostnames (#1031, #1034); `FORGE_URL` rendered via Nomad service discovery template (not static env) to handle bridge vs. host network differences (#1034); dispatcher Vault secret path changed to `kv/data/disinto/shared/ops-repo` (#1041); Vault-templated ops-repo creds via `service-dispatcher` policy (S5.1, #988) | +| `jobs/edge.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy reverse proxy + dispatcher sidecar; routes /forge, /woodpecker, /staging, /chat; uses `disinto/edge:local` image built by `bin/disinto --with edge`; Vault-templated ops-repo creds via `service-dispatcher` policy (S5.1, #988) | Nomad auto-merges every `*.hcl` under `-config=/etc/nomad.d/`, so the split between `server.hcl` and `client.hcl` is for readability, not diff --git a/nomad/jobs/edge.hcl b/nomad/jobs/edge.hcl index bf82b3d..4a495d9 100644 --- a/nomad/jobs/edge.hcl +++ b/nomad/jobs/edge.hcl @@ -123,19 +123,6 @@ job "edge" { # ── Caddyfile via Nomad service discovery (S5-fix-7, issue #1018) ──── # Renders staging upstream from Nomad service registration instead of # hardcoded staging:80. Caddy picks up /local/Caddyfile via entrypoint. - # Forge URL via Nomad service discovery (issue #1034) — resolves forgejo - # service address/port dynamically for bridge network compatibility. - template { - destination = "local/forge.env" - env = true - change_mode = "restart" - data = < + # Planner Agent **Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints), diff --git a/predictor/AGENTS.md b/predictor/AGENTS.md index f67d9d0..a263066 100644 --- a/predictor/AGENTS.md +++ b/predictor/AGENTS.md @@ -1,4 +1,4 @@ - + # Predictor Agent **Role**: Abstract adversary (the "goblin"). Runs a 2-step formula diff --git a/review/AGENTS.md b/review/AGENTS.md index 8709cfb..24606d1 100644 --- a/review/AGENTS.md +++ b/review/AGENTS.md @@ -1,4 +1,4 @@ - + # Review Agent **Role**: AI-powered PR review — post structured findings and formal diff --git a/review/review-pr.sh b/review/review-pr.sh index 09f6cb6..091025f 100755 --- a/review/review-pr.sh +++ b/review/review-pr.sh @@ -52,35 +52,8 @@ REVIEW_TMPDIR=$(mktemp -d) log() { printf '[%s] PR#%s %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" >> "$LOGFILE"; } status() { printf '[%s] PR #%s: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" > "$STATUSFILE"; log "$*"; } - -# cleanup — remove temp files (NOT lockfile — cleanup_on_exit handles that) -cleanup() { - rm -rf "$REVIEW_TMPDIR" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json" -} - -# cleanup_on_exit — defensive cleanup: remove lockfile if we own it, kill residual children -# This handles the case where review-pr.sh is terminated unexpectedly (e.g., watchdog SIGTERM) -cleanup_on_exit() { - local ec=$? - # Remove lockfile only if we own it (PID matches $$) - if [ -f "$LOCKFILE" ] && [ -n "$(cat "$LOCKFILE" 2>/dev/null)" ]; then - if [ "$(cat "$LOCKFILE" 2>/dev/null)" = "$$" ]; then - rm -f "$LOCKFILE" - log "cleanup_on_exit: removed lockfile (we owned it)" - fi - fi - # Kill any direct children that may have been spawned by this process - # (e.g., bash -c commands from Claude's Bash tool that didn't get reaped) - pkill -P $$ 2>/dev/null || true - # Call the main cleanup function to remove temp files - cleanup - exit "$ec" -} -trap cleanup_on_exit EXIT INT TERM - -# Note: EXIT trap is already set above. The cleanup function is still available for -# non-error exits (e.g., normal completion via exit 0 after verdict posted). -# When review succeeds, we want to skip lockfile removal since the verdict was posted. +cleanup() { rm -rf "$REVIEW_TMPDIR" "$LOCKFILE" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json"; } +trap cleanup EXIT # ============================================================================= # LOG ROTATION @@ -131,7 +104,6 @@ if [ "$PR_STATE" != "open" ]; then log "SKIP: state=${PR_STATE}" worktree_cleanup "$WORKTREE" rm -f "$OUTPUT_FILE" "$SID_FILE" 2>/dev/null || true - rm -f "$LOCKFILE" exit 0 fi @@ -141,7 +113,7 @@ fi CI_STATE=$(ci_commit_status "$PR_SHA") CI_NOTE="" if ! ci_passed "$CI_STATE"; then - ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; rm -f "$LOCKFILE"; exit 0; } + ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; exit 0; } CI_NOTE=" (not required — non-code PR)" fi @@ -151,10 +123,10 @@ fi ALL_COMMENTS=$(forge_api_all "/issues/${PR_NUMBER}/comments") HAS_CMT=$(printf '%s' "$ALL_COMMENTS" | jq --arg s "$PR_SHA" \ '[.[]|select(.body|contains(""))]|length') -[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; rm -f "$LOCKFILE"; exit 0; } +[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; exit 0; } HAS_FML=$(forge_api_all "/pulls/${PR_NUMBER}/reviews" | jq --arg s "$PR_SHA" \ '[.[]|select(.commit_id==$s)|select(.state!="COMMENT")]|length') -[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; rm -f "$LOCKFILE"; exit 0; } +[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; exit 0; } # ============================================================================= # RE-REVIEW DETECTION @@ -352,7 +324,3 @@ esac profile_write_journal "review-${PR_NUMBER}" "Review PR #${PR_NUMBER} (${VERDICT})" "${VERDICT,,}" "" || true log "DONE: ${VERDICT} (re-review: ${IS_RE_REVIEW})" - -# Remove lockfile on successful completion (cleanup_on_exit will also do this, -# but we do it here to avoid the trap running twice) -rm -f "$LOCKFILE" diff --git a/supervisor/AGENTS.md b/supervisor/AGENTS.md index 004c81f..23a3832 100644 --- a/supervisor/AGENTS.md +++ b/supervisor/AGENTS.md @@ -1,4 +1,4 @@ - + # Supervisor Agent **Role**: Health monitoring and auto-remediation, executed as a formula-driven diff --git a/tests/disinto-init-nomad.bats b/tests/disinto-init-nomad.bats index 54c3655..8c8b9a4 100644 --- a/tests/disinto-init-nomad.bats +++ b/tests/disinto-init-nomad.bats @@ -426,19 +426,3 @@ setup_file() { [[ "$output" == *"services to deploy: forgejo,woodpecker-server,woodpecker-agent,agents"* ]] [[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent agents"* ]] } - -# S5.1 / #1035 — edge service seeds ops-repo (dispatcher FORGE_TOKEN) -@test "disinto init --backend=nomad --with edge deploys edge" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with edge --dry-run - [ "$status" -eq 0 ] - # edge depends on all backend services, so all are included - [[ "$output" == *"services to deploy: edge,forgejo"* ]] - [[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent agents staging chat edge"* ]] - [[ "$output" == *"[deploy] [dry-run] nomad job validate"*"edge.hcl"* ]] -} - -@test "disinto init --backend=nomad --with edge seeds ops-repo" { - run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with edge --dry-run - [ "$status" -eq 0 ] - [[ "$output" == *"tools/vault-seed-ops-repo.sh --dry-run"* ]] -} diff --git a/tests/smoke-edge-subpath.sh b/tests/smoke-edge-subpath.sh new file mode 100755 index 0000000..d23d06b --- /dev/null +++ b/tests/smoke-edge-subpath.sh @@ -0,0 +1,390 @@ +#!/usr/bin/env bash +# ============================================================================= +# smoke-edge-subpath.sh — End-to-end subpath routing smoke test +# +# Verifies Forgejo, Woodpecker, and chat function correctly under subpaths: +# - Forgejo at /forge/ +# - Woodpecker at /ci/ +# - Chat at /chat/ +# - Staging at /staging/ +# +# Acceptance criteria: +# 1. Forgejo login at /forge/ completes without redirect loops +# 2. Forgejo OAuth callback for Woodpecker succeeds under subpath +# 3. Woodpecker dashboard loads all assets at /ci/ (no 404s on JS/CSS) +# 4. Chat OAuth login flow works at /chat/login +# 5. Forward_auth on /chat/* rejects unauthenticated requests with 401 +# 6. Staging content loads at /staging/ +# 7. Root / redirects to /forge/ +# +# Usage: +# smoke-edge-subpath.sh [--base-url BASE_URL] +# +# Environment variables: +# BASE_URL — Edge proxy URL (default: http://localhost) +# EDGE_TIMEOUT — Request timeout in seconds (default: 30) +# EDGE_MAX_RETRIES — Max retries per request (default: 3) +# +# Exit codes: +# 0 — All checks passed +# 1 — One or more checks failed +# ============================================================================= +set -euo pipefail + +# Script directory for relative paths +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Source common helpers +source "${SCRIPT_DIR}/../lib/env.sh" 2>/dev/null || true + +# ───────────────────────────────────────────────────────────────────────────── +# Configuration +# ───────────────────────────────────────────────────────────────────────────── + +BASE_URL="${BASE_URL:-http://localhost}" +EDGE_TIMEOUT="${EDGE_TIMEOUT:-30}" +EDGE_MAX_RETRIES="${EDGE_MAX_RETRIES:-3}" + +# Subpaths to test +FORGE_PATH="/forge/" +CI_PATH="/ci/" +CHAT_PATH="/chat/" +STAGING_PATH="/staging/" + +# Track overall test status +FAILED=0 +PASSED=0 +SKIPPED=0 + +# ───────────────────────────────────────────────────────────────────────────── +# Logging helpers +# ───────────────────────────────────────────────────────────────────────────── + +log_info() { + echo "[INFO] $*" +} + +log_pass() { + echo "[PASS] $*" + ((PASSED++)) || true +} + +log_fail() { + echo "[FAIL] $*" + ((FAILED++)) || true +} + +log_skip() { + echo "[SKIP] $*" + ((SKIPPED++)) || true +} + +log_section() { + echo "" + echo "=== $* ===" + echo "" +} + +# ───────────────────────────────────────────────────────────────────────────── +# HTTP helpers +# ───────────────────────────────────────────────────────────────────────────── + +# Make an HTTP request with retry logic +# Usage: http_request [options...] +# Returns: HTTP status code on stdout, body on stderr +http_request() { + local method="$1" + local url="$2" + shift 2 + + local retries=0 + local response status + + while [ "$retries" -lt "$EDGE_MAX_RETRIES" ]; do + response=$(curl -sS -w '\n%{http_code}' -X "$method" \ + --max-time "$EDGE_TIMEOUT" \ + -o /tmp/edge-response-$$ \ + "$@" "$url" 2>&1) || { + retries=$((retries + 1)) + log_info "Retry $retries/$EDGE_MAX_RETRIES for $url" + sleep 1 + continue + } + + status=$(echo "$response" | tail -n1) + + echo "$status" + return 0 + done + + log_fail "Max retries exceeded for $url" + return 1 +} + +# Make a GET request and return status code +# Usage: http_get [curl_options...] +# Returns: HTTP status code +http_get() { + local url="$1" + shift + + http_request "GET" "$url" "$@" +} + +# Make a HEAD request (no body) +# Usage: http_head [curl_options...] +# Returns: HTTP status code +http_head() { + local url="$1" + shift + + http_request "HEAD" "$url" "$@" +} + +# ───────────────────────────────────────────────────────────────────────────── +# Test checkers +# ───────────────────────────────────────────────────────────────────────────── + +# Check if a URL returns a valid response (2xx or 3xx) +# Usage: check_http_status +check_http_status() { + local url="$1" + local expected_pattern="$2" + local description="$3" + + local status + status=$(http_get "$url") + + if echo "$status" | grep -qE "$expected_pattern"; then + log_pass "$description: $url → $status" + return 0 + else + log_fail "$description: $url → $status (expected: $expected_pattern)" + return 1 + fi +} + +# Check that a URL does NOT redirect in a loop +# Usage: check_no_redirect_loop [max_redirects] +check_no_redirect_loop() { + local url="$1" + local max_redirects="${2:-10}" + local description="$3" + + # Use curl with max redirects and check the final status + local response status follow_location + + response=$(curl -sS -w '\n%{http_code}\n%{redirect_url}' \ + --max-time "$EDGE_TIMEOUT" \ + --max-redirs "$max_redirects" \ + -o /tmp/edge-response-$$ \ + "$url" 2>&1) || { + log_fail "$description: curl failed ($?)" + return 1 + } + + status=$(echo "$response" | sed -n '$p') + follow_location=$(echo "$response" | sed -n "$((NR-1))p") + + # If we hit max redirects, the last redirect is still in follow_location + if [ "$status" = "000" ] && [ -n "$follow_location" ]; then + log_fail "$description: possible redirect loop detected (last location: $follow_location)" + return 1 + fi + + # Check final status is in valid range + if echo "$status" | grep -qE '^(2|3)[0-9][0-9]$'; then + log_pass "$description: no redirect loop ($status)" + return 0 + else + log_fail "$description: unexpected status $status" + return 1 + fi +} + +# Check that specific assets load without 404 +# Usage: check_assets_no_404 +check_assets_no_404() { + local base_url="$1" + local _pattern="$2" + local description="$3" + + local assets_found=0 + local assets_404=0 + + # Fetch the main page and extract asset URLs + local main_page + main_page=$(curl -sS --max-time "$EDGE_TIMEOUT" "$base_url" 2>/dev/null) || { + log_skip "$description: could not fetch main page" + return 0 + } + + # Extract URLs matching the pattern (e.g., .js, .css files) + local assets + assets=$(echo "$main_page" | grep -oE 'https?://[^"'"'"']+\.(js|css|woff|woff2|ttf|eot|svg|png|jpg|jpeg|gif|ico)' | sort -u || true) + + if [ -z "$assets" ]; then + log_skip "$description: no assets found to check" + return 0 + fi + + assets_found=$(echo "$assets" | wc -l) + + # Check each asset + while IFS= read -r asset; do + local status + status=$(http_head "$asset") + + if [ "$status" = "404" ]; then + log_fail "$description: asset 404: $asset" + assets_404=$((assets_404 + 1)) + fi + done <<< "$assets" + + if [ $assets_404 -eq 0 ]; then + log_pass "$description: all $assets_found assets loaded (0 404s)" + return 0 + else + log_fail "$description: $assets_404/$assets_found assets returned 404" + return 1 + fi +} + +# Check that a path returns 401 (unauthorized) +# Usage: check_returns_401 +check_returns_401() { + local url="$1" + local description="$2" + + local status + status=$(http_get "$url") + + if [ "$status" = "401" ]; then + log_pass "$description: $url → 401 (as expected)" + return 0 + else + log_fail "$description: $url → $status (expected 401)" + return 1 + fi +} + +# Check that a path returns 302 redirect to expected location +# Usage: check_redirects_to +check_redirects_to() { + local url="$1" + local expected_target="$2" + local description="$3" + + local response status location + + response=$(curl -sS -w '\n%{http_code}\n%{redirect_url}' \ + --max-time "$EDGE_TIMEOUT" \ + --max-redirs 1 \ + -o /tmp/edge-response-$$ \ + "$url" 2>&1) || { + log_fail "$description: curl failed" + return 1 + } + + status=$(echo "$response" | sed -n '$p') + location=$(echo "$response" | sed -n "$((NR-1))p") + + if [ "$status" = "302" ] && echo "$location" | grep -qF "$expected_target"; then + log_pass "$description: redirects to $location" + return 0 + else + log_fail "$description: status=$status, location=$location (expected 302 → $expected_target)" + return 1 + fi +} + +# ───────────────────────────────────────────────────────────────────────────── +# Main test suite +# ───────────────────────────────────────────────────────────────────────────── + +main() { + log_section "Edge Subpath Routing Smoke Test" + log_info "Base URL: $BASE_URL" + log_info "Timeout: ${EDGE_TIMEOUT}s, Max retries: $EDGE_MAX_RETRIES" + + # ─── Test 1: Root redirects to /forge/ ────────────────────────────────── + log_section "Test 1: Root redirects to /forge/" + + check_redirects_to "$BASE_URL" "$FORGE_PATH" "Root redirect" || FAILED=1 + if [ "$FAILED" -eq 0 ]; then ((PASSED++)) || true; fi + + # ─── Test 2: Forgejo login at /forge/ without redirect loops ──────────── + log_section "Test 2: Forgejo login at /forge/" + + check_no_redirect_loop "$BASE_URL$FORGE_PATH" 10 "Forgejo root" || FAILED=1 + check_http_status "$BASE_URL$FORGE_PATH" "^(2|3)[0-9][0-9]$" "Forgejo root status" || FAILED=1 + if [ "$FAILED" -eq 0 ]; then ((PASSED++)) || true; fi + + # ─── Test 3: Forgejo OAuth callback at /forge/_oauth/callback ─────────── + log_section "Test 3: Forgejo OAuth callback at /forge/_oauth/callback" + + check_http_status "$BASE_URL/forge/_oauth/callback" "^(2|3|4|5)[0-9][0-9]$" "Forgejo OAuth callback" || FAILED=1 + if [ "$FAILED" -eq 0 ]; then ((PASSED++)) || true; fi + + # ─── Test 4: Woodpecker dashboard at /ci/ ─────────────────────────────── + log_section "Test 4: Woodpecker dashboard at /ci/" + + check_no_redirect_loop "$BASE_URL$CI_PATH" 10 "Woodpecker root" || FAILED=1 + check_http_status "$BASE_URL$CI_PATH" "^(2|3)[0-9][0-9]$" "Woodpecker root status" || FAILED=1 + check_assets_no_404 "$BASE_URL$CI_PATH" "\.(js|css)" "Woodpecker assets" || FAILED=1 + if [ "$FAILED" -eq 0 ]; then ((PASSED++)) || true; fi + + # ─── Test 5: Chat OAuth login at /chat/login ──────────────────────────── + log_section "Test 5: Chat OAuth login at /chat/login" + + check_http_status "$BASE_URL$CHAT_PATH/login" "^(2|3)[0-9][0-9]$" "Chat login page" || FAILED=1 + if [ "$FAILED" -eq 0 ]; then ((PASSED++)) || true; fi + + # ─── Test 6: Chat OAuth callback at /chat/oauth/callback ──────────────── + log_section "Test 6: Chat OAuth callback at /chat/oauth/callback" + + check_http_status "$BASE_URL/chat/oauth/callback" "^(2|3)[0-9][0-9]$" "Chat OAuth callback" || FAILED=1 + if [ "$FAILED" -eq 0 ]; then ((PASSED++)) || true; fi + + # ─── Test 7: Forward_auth on /chat/* returns 401 for unauthenticated ──── + log_section "Test 7: Forward_auth on /chat/* returns 401" + + # Test a protected chat endpoint (chat dashboard) + check_returns_401 "$BASE_URL$CHAT_PATH/" "Chat root (unauthenticated)" || FAILED=1 + check_returns_401 "$BASE_URL$CHAT_PATH/dashboard" "Chat dashboard (unauthenticated)" || FAILED=1 + if [ "$FAILED" -eq 0 ]; then ((PASSED++)) || true; fi + + # ─── Test 8: Staging at /staging/ ─────────────────────────────────────── + log_section "Test 8: Staging at /staging/" + + check_http_status "$BASE_URL$STAGING_PATH" "^(2|3)[0-9][0-9]$" "Staging root" || FAILED=1 + if [ "$FAILED" -eq 0 ]; then ((PASSED++)) || true; fi + + # ─── Test 9: Caddy admin API health ───────────────────────────────────── + log_section "Test 9: Caddy admin API health" + + # Caddy admin API is typically on port 2019 locally + if curl -sS --max-time 5 "http://127.0.0.1:2019/" >/dev/null 2>&1; then + log_pass "Caddy admin API reachable" + ((PASSED++)) + else + log_skip "Caddy admin API not reachable (expected if edge is remote)" + fi + + # ─── Summary ──────────────────────────────────────────────────────────── + log_section "Test Summary" + log_info "Passed: $PASSED" + log_info "Failed: $FAILED" + log_info "Skipped: $SKIPPED" + + if [ $FAILED -gt 0 ]; then + log_section "TEST FAILED" + exit 1 + fi + + log_section "TEST PASSED" + exit 0 +} + +# Run main +main "$@" diff --git a/tests/smoke-init.sh b/tests/smoke-init.sh index 8cd4fee..306f7ee 100644 --- a/tests/smoke-init.sh +++ b/tests/smoke-init.sh @@ -15,7 +15,6 @@ set -euo pipefail FACTORY_ROOT="$(cd "$(dirname "$0")/.." && pwd)" -export FACTORY_ROOT_REAL="$FACTORY_ROOT" # Always use localhost for mock Forgejo (in case FORGE_URL is set from docker-compose) export FORGE_URL="http://localhost:3000" MOCK_BIN="/tmp/smoke-mock-bin" @@ -31,8 +30,7 @@ cleanup() { rm -rf "$MOCK_BIN" /tmp/smoke-test-repo \ "${FACTORY_ROOT}/projects/smoke-repo.toml" \ /tmp/smoke-claude-shared /tmp/smoke-home-claude \ - /tmp/smoke-env-before-rerun /tmp/smoke-env-before-dryrun \ - "${FACTORY_ROOT}/docker-compose.yml" + /tmp/smoke-env-before-rerun /tmp/smoke-env-before-dryrun # Restore .env only if we created the backup if [ -f "${FACTORY_ROOT}/.env.smoke-backup" ]; then mv "${FACTORY_ROOT}/.env.smoke-backup" "${FACTORY_ROOT}/.env" @@ -425,51 +423,6 @@ export CLAUDE_SHARED_DIR="$ORIG_CLAUDE_SHARED_DIR" export CLAUDE_CONFIG_DIR="$ORIG_CLAUDE_CONFIG_DIR" rm -rf /tmp/smoke-claude-shared /tmp/smoke-home-claude -# ── 8. Test duplicate service name detection ────────────────────────────── -echo "=== 8/8 Testing duplicate service name detection ===" - -# Isolated factory root — do NOT touch the real ${FACTORY_ROOT}/projects/ -SMOKE_DUP_ROOT=$(mktemp -d) -mkdir -p "$SMOKE_DUP_ROOT/projects" -cat > "$SMOKE_DUP_ROOT/projects/duplicate-test.toml" <<'TOMLEOF' -name = "duplicate-test" -description = "dup-detection smoke" - -[ci] -woodpecker_repo_id = "999" - -[agents.llama] -base_url = "http://localhost:8080" -model = "qwen:latest" -roles = ["dev"] -forge_user = "llama-bot" -TOMLEOF - -# Call the generator directly — no `disinto init` to overwrite the TOML. -# FACTORY_ROOT tells generators.sh where projects/ + compose_file live. -( - export FACTORY_ROOT="$SMOKE_DUP_ROOT" - export ENABLE_LLAMA_AGENT=1 - # shellcheck disable=SC1091 - source "${FACTORY_ROOT_REAL:-$(cd "$(dirname "$0")/.." && pwd)}/lib/generators.sh" - # Use a temp file to capture output since pipefail will kill the pipeline - # when _generate_compose_impl returns non-zero - _generate_compose_impl > /tmp/smoke-dup-output.txt 2>&1 || true - if grep -q "Duplicate service name" /tmp/smoke-dup-output.txt; then - pass "Duplicate service detection: conflict between ENABLE_LLAMA_AGENT and [agents.llama] reported" - rm -f /tmp/smoke-dup-output.txt - exit 0 - else - fail "Duplicate service detection: no error raised for ENABLE_LLAMA_AGENT + [agents.llama]" - cat /tmp/smoke-dup-output.txt >&2 - rm -f /tmp/smoke-dup-output.txt - exit 1 - fi -) || FAILED=1 - -rm -rf "$SMOKE_DUP_ROOT" -unset ENABLE_LLAMA_AGENT - # ── Summary ────────────────────────────────────────────────────────────────── echo "" if [ "$FAILED" -ne 0 ]; then diff --git a/tests/test-duplicate-service-detection.sh b/tests/test-duplicate-service-detection.sh deleted file mode 100755 index 11fde86..0000000 --- a/tests/test-duplicate-service-detection.sh +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env bash -# tests/test-duplicate-service-detection.sh — Unit test for duplicate service detection -# -# Tests that the compose generator correctly detects duplicate service names -# between ENABLE_LLAMA_AGENT=1 and [agents.llama] TOML configuration. - -set -euo pipefail - -# Get the absolute path to the disinto root -DISINTO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -TEST_DIR=$(mktemp -d) -trap "rm -rf \"\$TEST_DIR\"" EXIT - -FAILED=0 - -fail() { printf 'FAIL: %s\n' "$*" >&2; FAILED=1; } -pass() { printf 'PASS: %s\n' "$*"; } - -# Test 1: Duplicate between ENABLE_LLAMA_AGENT and [agents.llama] -echo "=== Test 1: Duplicate between ENABLE_LLAMA_AGENT and [agents.llama] ===" - -# Create projects directory and test project TOML with an agent named "llama" -mkdir -p "${TEST_DIR}/projects" -cat > "${TEST_DIR}/projects/test-project.toml" <<'TOMLEOF' -name = "test-project" -description = "Test project for duplicate detection" - -[ci] -woodpecker_repo_id = "123" - -[agents.llama] -base_url = "http://localhost:8080" -model = "qwen:latest" -roles = ["dev"] -forge_user = "llama-bot" -TOMLEOF - -# Create a minimal compose file -cat > "${TEST_DIR}/docker-compose.yml" <<'COMPOSEEOF' -# Test compose file -services: - agents: - image: test:latest - command: echo "hello" - -volumes: - test-data: - -networks: - test-net: -COMPOSEEOF - -# Set up the test environment -export FACTORY_ROOT="${TEST_DIR}" -export PROJECT_NAME="test-project" -export ENABLE_LLAMA_AGENT="1" -export FORGE_TOKEN="" -export FORGE_PASS="" -export CLAUDE_TIMEOUT="7200" -export POLL_INTERVAL="300" -export GARDENER_INTERVAL="21600" -export ARCHITECT_INTERVAL="21600" -export PLANNER_INTERVAL="43200" -export SUPERVISOR_INTERVAL="1200" - -# Source the generators module and run the compose generator directly -source "${DISINTO_ROOT}/lib/generators.sh" - -# Delete the compose file to force regeneration -rm -f "${TEST_DIR}/docker-compose.yml" - -# Run the compose generator directly -if _generate_compose_impl 3000 false 2>&1 | tee "${TEST_DIR}/output.txt"; then - # Check if the output contains the duplicate error message - if grep -q "Duplicate service name 'agents-llama'" "${TEST_DIR}/output.txt"; then - pass "Duplicate detection: correctly detected conflict between ENABLE_LLAMA_AGENT and [agents.llama]" - else - fail "Duplicate detection: should have detected conflict between ENABLE_LLAMA_AGENT and [agents.llama]" - cat "${TEST_DIR}/output.txt" >&2 - fi -else - # Generator should fail with non-zero exit code - if grep -q "Duplicate service name 'agents-llama'" "${TEST_DIR}/output.txt"; then - pass "Duplicate detection: correctly detected conflict and returned non-zero exit code" - else - fail "Duplicate detection: should have failed with duplicate error" - cat "${TEST_DIR}/output.txt" >&2 - fi -fi - -# Test 2: No duplicate when only ENABLE_LLAMA_AGENT is set (no conflicting TOML) -echo "" -echo "=== Test 2: No duplicate when only ENABLE_LLAMA_AGENT is set ===" - -# Remove the projects directory created in Test 1 -rm -rf "${TEST_DIR}/projects" - -# Create a fresh compose file -cat > "${TEST_DIR}/docker-compose.yml" <<'COMPOSEEOF' -# Test compose file -services: - agents: - image: test:latest - -volumes: - test-data: - -networks: - test-net: -COMPOSEEOF - -# Set ENABLE_LLAMA_AGENT -export ENABLE_LLAMA_AGENT="1" - -# Delete the compose file to force regeneration -rm -f "${TEST_DIR}/docker-compose.yml" - -if _generate_compose_impl 3000 false 2>&1 | tee "${TEST_DIR}/output2.txt"; then - if grep -q "Duplicate" "${TEST_DIR}/output2.txt"; then - fail "No duplicate: should not detect duplicate when only ENABLE_LLAMA_AGENT is set" - else - pass "No duplicate: correctly generated compose without duplicates" - fi -else - # Non-zero exit is fine if there's a legitimate reason (e.g., missing files) - if grep -q "Duplicate" "${TEST_DIR}/output2.txt"; then - fail "No duplicate: should not detect duplicate when only ENABLE_LLAMA_AGENT is set" - else - pass "No duplicate: generator failed for other reason (acceptable)" - fi -fi - -# Test 3: Duplicate between two TOML agents with same name -echo "" -echo "=== Test 3: Duplicate between two TOML agents with same name ===" - -rm -f "${TEST_DIR}/docker-compose.yml" - -# Create projects directory for Test 3 -mkdir -p "${TEST_DIR}/projects" - -cat > "${TEST_DIR}/projects/project1.toml" <<'TOMLEOF' -name = "project1" -description = "First project" - -[ci] -woodpecker_repo_id = "1" - -[agents.llama] -base_url = "http://localhost:8080" -model = "qwen:latest" -roles = ["dev"] -forge_user = "llama-bot1" -TOMLEOF - -cat > "${TEST_DIR}/projects/project2.toml" <<'TOMLEOF' -name = "project2" -description = "Second project" - -[ci] -woodpecker_repo_id = "2" - -[agents.llama] -base_url = "http://localhost:8080" -model = "qwen:latest" -roles = ["dev"] -forge_user = "llama-bot2" -TOMLEOF - -cat > "${TEST_DIR}/docker-compose.yml" <<'COMPOSEEOF' -# Test compose file -services: - agents: - image: test:latest - -volumes: - test-data: - -networks: - test-net: -COMPOSEEOF - -unset ENABLE_LLAMA_AGENT - -# Delete the compose file to force regeneration -rm -f "${TEST_DIR}/docker-compose.yml" - -if _generate_compose_impl 3000 false 2>&1 | tee "${TEST_DIR}/output3.txt"; then - if grep -q "Duplicate service name 'agents-llama'" "${TEST_DIR}/output3.txt"; then - pass "Duplicate detection: correctly detected conflict between two [agents.llama] blocks" - else - fail "Duplicate detection: should have detected conflict between two [agents.llama] blocks" - cat "${TEST_DIR}/output3.txt" >&2 - fi -else - if grep -q "Duplicate service name 'agents-llama'" "${TEST_DIR}/output3.txt"; then - pass "Duplicate detection: correctly detected conflict and returned non-zero exit code" - else - fail "Duplicate detection: should have failed with duplicate error" - cat "${TEST_DIR}/output3.txt" >&2 - fi -fi - -# Summary -echo "" -if [ "$FAILED" -ne 0 ]; then - echo "=== TESTS FAILED ===" - exit 1 -fi -echo "=== ALL TESTS PASSED ===" diff --git a/tests/test-watchdog-process-group.sh b/tests/test-watchdog-process-group.sh deleted file mode 100755 index 54fedf9..0000000 --- a/tests/test-watchdog-process-group.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env bash -# test-watchdog-process-group.sh — Test that claude_run_with_watchdog kills orphan children -# -# This test verifies that when claude_run_with_watchdog terminates the Claude process, -# all child processes (including those spawned by Claude's Bash tool) are also killed. -# -# Reproducer scenario: -# 1. Create a fake "claude" stub that: -# a. Spawns a long-running child process (sleep 3600) -# b. Writes a result marker to stdout to trigger idle detection -# c. Stays running -# 2. Run claude_run_with_watchdog with the stub -# 3. Before the fix: sleep child survives (orphaned to PID 1) -# 4. After the fix: sleep child dies (killed as part of process group with -PID) -# -# Usage: ./tests/test-watchdog-process-group.sh - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)" -TEST_TMP="/tmp/test-watchdog-$$" -LOGFILE="${TEST_TMP}/log.txt" -PASS=true - -# shellcheck disable=SC2317 -cleanup_test() { - rm -rf "$TEST_TMP" -} -trap cleanup_test EXIT INT TERM - -mkdir -p "$TEST_TMP" - -log() { - printf '[TEST] %s\n' "$*" | tee -a "$LOGFILE" -} - -fail() { - printf '[TEST] FAIL: %s\n' "$*" | tee -a "$LOGFILE" - PASS=false -} - -pass() { - printf '[TEST] PASS: %s\n' "$*" | tee -a "$LOGFILE" -} - -# Export required environment variables -export CLAUDE_TIMEOUT=10 # Short timeout for testing -export CLAUDE_IDLE_GRACE=2 # Short grace period for testing -export LOGFILE="${LOGFILE}" # Required by agent-sdk.sh - -# Create a fake claude stub that: -# 1. Spawns a long-running child process (sleep 3600) that will become an orphan if parent is killed -# 2. Writes a result marker to stdout (to trigger the watchdog's idle-after-result path) -# 3. Stays running so the watchdog can kill it -cat > "${TEST_TMP}/fake-claude" << 'FAKE_CLAUDE_EOF' -#!/usr/bin/env bash -# Fake claude that spawns a child and stays running -# Simulates Claude's behavior when it spawns a Bash tool command - -# Write result marker to stdout (triggers watchdog idle detection) -echo '{"type":"result","session_id":"test-session-123","verdict":"APPROVE"}' - -# Spawn a child that simulates Claude's Bash tool hanging -# This is the process that should be killed when the parent is terminated -sleep 3600 & -CHILD_PID=$! - -# Log the child PID for debugging -echo "FAKE_CLAUDE_CHILD_PID=$CHILD_PID" >&2 - -# Stay running - sleep in a loop so the watchdog can kill us -while true; do - sleep 3600 & - wait $! 2>/dev/null || true -done -FAKE_CLAUDE_EOF -chmod +x "${TEST_TMP}/fake-claude" - -log "Testing claude_run_with_watchdog process group cleanup..." - -# Source the library and run claude_run_with_watchdog -cd "$SCRIPT_DIR" -source lib/agent-sdk.sh - -log "Starting claude_run_with_watchdog with fake claude..." - -# Run the function directly (not as a script) -# We need to capture output and redirect stderr -OUTPUT_FILE="${TEST_TMP}/output.txt" -timeout 35 bash -c " - source '${SCRIPT_DIR}/lib/agent-sdk.sh' - CLAUDE_TIMEOUT=10 CLAUDE_IDLE_GRACE=2 LOGFILE='${LOGFILE}' claude_run_with_watchdog '${TEST_TMP}/fake-claude' > '${OUTPUT_FILE}' 2>&1 - exit \$? -" || true - -# Give the watchdog a moment to clean up -log "Waiting for cleanup..." -sleep 5 - -# More precise check: look for sleep 3600 processes -# These would be the orphans from our fake claude -ORPHAN_COUNT=$(pgrep -a sleep 2>/dev/null | grep -c "sleep 3600" 2>/dev/null || echo "0") - -if [ "$ORPHAN_COUNT" -gt 0 ]; then - log "Found $ORPHAN_COUNT orphan sleep 3600 processes:" - pgrep -a sleep | grep "sleep 3600" - fail "Orphan children found - process group cleanup did not work" -else - pass "No orphan children found - process group cleanup worked" -fi - -# Also verify that the fake claude itself is not running -FAKE_CLAUDE_COUNT=$(pgrep -c -f "fake-claude" 2>/dev/null || echo "0") -if [ "$FAKE_CLAUDE_COUNT" -gt 0 ]; then - log "Found $FAKE_CLAUDE_COUNT fake-claude processes still running" - fail "Fake claude process(es) still running" -else - pass "Fake claude process terminated" -fi - -# Summary -echo "" -if [ "$PASS" = true ]; then - log "All tests passed!" - exit 0 -else - log "Some tests failed. See log at $LOGFILE" - exit 1 -fi diff --git a/tools/vault-seed-ops-repo.sh b/tools/vault-seed-ops-repo.sh deleted file mode 100755 index 09a2fba..0000000 --- a/tools/vault-seed-ops-repo.sh +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# tools/vault-seed-ops-repo.sh — Idempotent seed for kv/disinto/shared/ops-repo -# -# Part of the Nomad+Vault migration (S5.1, issue #1035). Populates the KV v2 -# path that nomad/jobs/edge.hcl dispatcher task reads from, so the edge -# proxy has FORGE_TOKEN for ops repo access. -# -# Seeds from kv/disinto/bots/vault (the vault bot credentials) — copies the -# token field to kv/disinto/shared/ops-repo. This is the "service" path that -# dispatcher uses, distinct from the "agent" path (bots/vault) used by -# agent tasks under the service-agents policy. -# -# Idempotency contract: -# - Key present with non-empty value → leave untouched, log "token unchanged". -# - Key missing or empty → copy from bots/vault, log "token copied". -# - If bots/vault is also empty → generate a random value, log "token generated". -# -# Preconditions: -# - Vault reachable + unsealed at $VAULT_ADDR. -# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable. -# - The `kv/` mount is enabled as KV v2. -# -# Requires: -# - VAULT_ADDR (e.g. http://127.0.0.1:8200) -# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh) -# - curl, jq, openssl -# -# Usage: -# tools/vault-seed-ops-repo.sh -# tools/vault-seed-ops-repo.sh --dry-run -# -# Exit codes: -# 0 success (seed applied, or already applied) -# 1 precondition / API / mount-mismatch failure -# ============================================================================= -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" - -# shellcheck source=../lib/hvault.sh -source "${REPO_ROOT}/lib/hvault.sh" - -# KV v2 mount + logical paths -KV_MOUNT="kv" -OPS_REPO_PATH="disinto/shared/ops-repo" -VAULT_BOT_PATH="disinto/bots/vault" - -OPS_REPO_API="${KV_MOUNT}/data/${OPS_REPO_PATH}" -VAULT_BOT_API="${KV_MOUNT}/data/${VAULT_BOT_PATH}" - -log() { printf '[vault-seed-ops-repo] %s\n' "$*"; } -die() { printf '[vault-seed-ops-repo] ERROR: %s\n' "$*" >&2; exit 1; } - -# ── Flag parsing ───────────────────────────────────────────────────────────── -DRY_RUN=0 -case "$#:${1-}" in - 0:) - ;; - 1:--dry-run) - DRY_RUN=1 - ;; - 1:-h|1:--help) - printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")" - printf 'Seed kv/disinto/shared/ops-repo with FORGE_TOKEN.\n\n' - printf 'Copies token from kv/disinto/bots/vault if present;\n' - printf 'otherwise generates a random value. Idempotent:\n' - printf 'existing non-empty values are left untouched.\n\n' - printf ' --dry-run Print planned actions without writing.\n' - exit 0 - ;; - *) - die "invalid arguments: $* (try --help)" - ;; -esac - -# ── Preconditions ──────────────────────────────────────────────────────────── -for bin in curl jq openssl; do - command -v "$bin" >/dev/null 2>&1 \ - || die "required binary not found: ${bin}" -done - -[ -n "${VAULT_ADDR:-}" ] \ - || die "VAULT_ADDR unset — e.g. export VAULT_ADDR=http://127.0.0.1:8200" -hvault_token_lookup >/dev/null \ - || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" - -# ── Step 1/2: ensure kv/ mount exists and is KV v2 ─────────────────────────── -log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──" -export DRY_RUN -hvault_ensure_kv_v2 "$KV_MOUNT" "[vault-seed-ops-repo]" \ - || die "KV mount check failed" - -# ── Step 2/2: seed ops-repo from vault bot ─────────────────────────────────── -log "── Step 2/2: seed ${OPS_REPO_API} ──" - -# Read existing ops-repo value -existing_raw="$(hvault_get_or_empty "${OPS_REPO_API}")" \ - || die "failed to read ${OPS_REPO_API}" - -existing_token="" -if [ -n "$existing_raw" ]; then - existing_token="$(printf '%s' "$existing_raw" | jq -r '.data.data.token // ""')" -fi - -desired_token="$existing_token" -action="" - -if [ -z "$existing_token" ]; then - # Token missing — try to copy from vault bot - bot_raw="$(hvault_get_or_empty "${VAULT_BOT_API}")" || true - if [ -n "$bot_raw" ]; then - bot_token="$(printf '%s' "$bot_raw" | jq -r '.data.data.token // ""')" - if [ -n "$bot_token" ]; then - desired_token="$bot_token" - action="copied" - fi - fi - - # If still no token, generate one - if [ -z "$desired_token" ]; then - if [ "$DRY_RUN" -eq 1 ]; then - action="generated (dry-run)" - else - desired_token="$(openssl rand -hex 32)" - action="generated" - fi - fi -fi - -if [ -z "$action" ]; then - log "all keys present at ${OPS_REPO_API} — no-op" - log "token unchanged" - exit 0 -fi - -if [ "$DRY_RUN" -eq 1 ]; then - log "[dry-run] ${OPS_REPO_PATH}: would ${action} token" - exit 0 -fi - -# Write the token -payload="$(jq -n --arg t "$desired_token" '{data: {token: $t}}')" -_hvault_request POST "${OPS_REPO_API}" "$payload" >/dev/null \ - || die "failed to write ${OPS_REPO_API}" - -log "${OPS_REPO_PATH}: ${action} token" -log "done — ${OPS_REPO_API} seeded" diff --git a/vault/policies/AGENTS.md b/vault/policies/AGENTS.md index 47af340..9a4b588 100644 --- a/vault/policies/AGENTS.md +++ b/vault/policies/AGENTS.md @@ -1,4 +1,4 @@ - + # vault/policies/ — Agent Instructions HashiCorp Vault ACL policies for the disinto factory. One `.hcl` file per