Compare commits
33 commits
fix/issue-
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 3aa521509a | |||
|
|
2c7c8d0b38 | ||
| ec4e608827 | |||
|
|
cb8c131bc4 | ||
|
|
c287ec0626 | ||
| 449611e6df | |||
| 9f365e40c0 | |||
|
|
e90ff4eb7b | ||
| 441e2a366d | |||
|
|
f878427866 | ||
|
|
0f91efc478 | ||
|
|
1170ecb2f0 | ||
| e9aed747b5 | |||
|
|
d1c7f4573a | ||
|
|
42807903ef | ||
|
|
1e1acd50ab | ||
| 9cc12f2303 | |||
| 072d352c1c | |||
|
|
78f4966d0c | ||
|
|
ca8079ae70 | ||
| 5ba18c8f80 | |||
|
|
1c0ec3c7ec | ||
| eb19aa6c84 | |||
|
|
86793c4c00 | ||
| 0bb04545d4 | |||
| 1de3b0d560 | |||
|
|
d1e535696a | ||
|
|
ada27759de | ||
|
|
2648c401f4 | ||
| b09463b162 | |||
|
|
72f981528d | ||
|
|
cd778c4775 | ||
|
|
bf3d16e8b3 |
32 changed files with 1337 additions and 112 deletions
|
|
@ -294,6 +294,10 @@ def main() -> int:
|
||||||
"9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern",
|
"9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern",
|
||||||
# Standard lib source block shared across formula-driven agent run scripts
|
# Standard lib source block shared across formula-driven agent run scripts
|
||||||
"330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)",
|
"330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)",
|
||||||
|
# Test data for duplicate service detection tests (#850)
|
||||||
|
# Intentionally duplicated TOML blocks in smoke-init.sh and test-duplicate-service-detection.sh
|
||||||
|
"334967b8b4f1a8d3b0b9b8e0912f3bfb": "Test TOML: [agents.llama] block header (smoke-init.sh + test-duplicate-service-detection.sh)",
|
||||||
|
"d82f30077e5bb23b5fc01db003033d5d": "Test TOML: [agents.llama] block body (smoke-init.sh + test-duplicate-service-detection.sh)",
|
||||||
# Common vault-seed script patterns: logging helpers + flag parsing
|
# Common vault-seed script patterns: logging helpers + flag parsing
|
||||||
# Used in tools/vault-seed-woodpecker.sh + lib/init/nomad/wp-oauth-register.sh
|
# Used in tools/vault-seed-woodpecker.sh + lib/init/nomad/wp-oauth-register.sh
|
||||||
"843a1cbf987952697d4e05e96ed2b2d5": "Logging helpers + DRY_RUN init (vault-seed-woodpecker + wp-oauth-register)",
|
"843a1cbf987952697d4e05e96ed2b2d5": "Logging helpers + DRY_RUN init (vault-seed-woodpecker + wp-oauth-register)",
|
||||||
|
|
@ -308,6 +312,21 @@ def main() -> int:
|
||||||
"63bfa88d71764c95c65a9a248f3e40ab": "Vault-seed preconditions: binary check end + VAULT_ADDR die",
|
"63bfa88d71764c95c65a9a248f3e40ab": "Vault-seed preconditions: binary check end + VAULT_ADDR die",
|
||||||
"34873ad3570b211ce1d90468ab6ac94c": "Vault-seed preconditions: VAULT_ADDR die + hvault_token_lookup",
|
"34873ad3570b211ce1d90468ab6ac94c": "Vault-seed preconditions: VAULT_ADDR die + hvault_token_lookup",
|
||||||
"71a52270f249e843cda48ad896d9f781": "Vault-seed preconditions: VAULT_ADDR + hvault_token_lookup + die",
|
"71a52270f249e843cda48ad896d9f781": "Vault-seed preconditions: VAULT_ADDR + hvault_token_lookup + die",
|
||||||
|
# Common vault-seed script flag parsing patterns
|
||||||
|
# Shared across tools/vault-seed-{forgejo,ops-repo}.sh
|
||||||
|
"6906b7787796c2ccb8dd622e2ad4e7bf": "vault-seed DRY_RUN init + case pattern (forgejo + ops-repo)",
|
||||||
|
"a0df5283b616b964f8bc32fd99ec1b5a": "vault-seed case pattern start (forgejo + ops-repo)",
|
||||||
|
"e15e3272fdd9f0f46ce9e726aea9f853": "vault-seed case pattern dry-run handler (forgejo + ops-repo)",
|
||||||
|
"c9f22385cc49a3dac1d336bc14c6315b": "vault-seed DRY_RUN assignment (forgejo + ops-repo)",
|
||||||
|
"106f4071e88f841b3208b01144cd1c39": "vault-seed case pattern dry-run end (forgejo + ops-repo)",
|
||||||
|
"c15506dcb6bb340b25d1c39d442dd2e6": "vault-seed help text + invalid arg handler (forgejo + ops-repo)",
|
||||||
|
"1feecd3b3caf00045fae938ddf2811de": "vault-seed invalid arg handler (forgejo + ops-repo)",
|
||||||
|
"919780d5e7182715344f5aa02b191294": "vault-seed invalid arg + esac pattern (forgejo + ops-repo)",
|
||||||
|
"8dce1d292bce8e60ef4c0665b62945b0": "vault-seed esac + binary check loop (forgejo + ops-repo)",
|
||||||
|
"ca043687143a5b47bd54e65a99ce8ee8": "vault-seed binary check loop start (forgejo + ops-repo)",
|
||||||
|
"aefd9f655411a955395e6e5995ddbe6f": "vault-seed binary check pattern (forgejo + ops-repo)",
|
||||||
|
"60f0c46deb5491599457efb4048918e5": "vault-seed VAULT_ADDR + hvault_token_lookup check (forgejo + ops-repo)",
|
||||||
|
"f6838f581ef6b4d82b55268389032769": "vault-seed VAULT_ADDR + hvault_token_lookup die (forgejo + ops-repo)",
|
||||||
}
|
}
|
||||||
|
|
||||||
if not sh_files:
|
if not sh_files:
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
|
<!-- last-reviewed: 5ba18c8f80da6e3e574823e39e5aa760731c1705 -->
|
||||||
# Disinto — Agent Instructions
|
# Disinto — Agent Instructions
|
||||||
|
|
||||||
## What this repo is
|
## What this repo is
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
|
<!-- last-reviewed: 5ba18c8f80da6e3e574823e39e5aa760731c1705 -->
|
||||||
# Architect — Agent Instructions
|
# Architect — Agent Instructions
|
||||||
|
|
||||||
## What this agent is
|
## What this agent is
|
||||||
|
|
|
||||||
23
bin/disinto
23
bin/disinto
|
|
@ -12,6 +12,7 @@
|
||||||
# disinto secrets <subcommand> Manage encrypted secrets
|
# disinto secrets <subcommand> Manage encrypted secrets
|
||||||
# disinto run <action-id> Run action in ephemeral runner container
|
# disinto run <action-id> Run action in ephemeral runner container
|
||||||
# disinto ci-logs <pipeline> [--step <name>] Read CI logs from Woodpecker SQLite
|
# disinto ci-logs <pipeline> [--step <name>] Read CI logs from Woodpecker SQLite
|
||||||
|
# disinto backup create <outfile> Export factory state for migration
|
||||||
#
|
#
|
||||||
# Usage:
|
# Usage:
|
||||||
# disinto init https://github.com/user/repo
|
# disinto init https://github.com/user/repo
|
||||||
|
|
@ -39,6 +40,7 @@ source "${FACTORY_ROOT}/lib/generators.sh"
|
||||||
source "${FACTORY_ROOT}/lib/forge-push.sh"
|
source "${FACTORY_ROOT}/lib/forge-push.sh"
|
||||||
source "${FACTORY_ROOT}/lib/ci-setup.sh"
|
source "${FACTORY_ROOT}/lib/ci-setup.sh"
|
||||||
source "${FACTORY_ROOT}/lib/release.sh"
|
source "${FACTORY_ROOT}/lib/release.sh"
|
||||||
|
source "${FACTORY_ROOT}/lib/backup.sh"
|
||||||
source "${FACTORY_ROOT}/lib/claude-config.sh"
|
source "${FACTORY_ROOT}/lib/claude-config.sh"
|
||||||
|
|
||||||
# ── Helpers ──────────────────────────────────────────────────────────────────
|
# ── Helpers ──────────────────────────────────────────────────────────────────
|
||||||
|
|
@ -62,6 +64,7 @@ Usage:
|
||||||
disinto hire-an-agent <agent-name> <role> [--formula <path>] [--local-model <url>] [--model <name>]
|
disinto hire-an-agent <agent-name> <role> [--formula <path>] [--local-model <url>] [--model <name>]
|
||||||
Hire a new agent (create user + .profile repo; re-run to rotate credentials)
|
Hire a new agent (create user + .profile repo; re-run to rotate credentials)
|
||||||
disinto agent <subcommand> Manage agent state (enable/disable)
|
disinto agent <subcommand> Manage agent state (enable/disable)
|
||||||
|
disinto backup create <outfile> Export factory state (issues + ops bundle)
|
||||||
disinto edge <verb> [options] Manage edge tunnel registrations
|
disinto edge <verb> [options] Manage edge tunnel registrations
|
||||||
|
|
||||||
Edge subcommands:
|
Edge subcommands:
|
||||||
|
|
@ -802,6 +805,7 @@ _disinto_init_nomad() {
|
||||||
woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;;
|
woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;;
|
||||||
agents) seed_name="agents" ;;
|
agents) seed_name="agents" ;;
|
||||||
chat) seed_name="chat" ;;
|
chat) seed_name="chat" ;;
|
||||||
|
edge) seed_name="ops-repo" ;;
|
||||||
esac
|
esac
|
||||||
local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh"
|
local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh"
|
||||||
if [ -x "$seed_script" ]; then
|
if [ -x "$seed_script" ]; then
|
||||||
|
|
@ -983,6 +987,7 @@ _disinto_init_nomad() {
|
||||||
woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;;
|
woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;;
|
||||||
agents) seed_name="agents" ;;
|
agents) seed_name="agents" ;;
|
||||||
chat) seed_name="chat" ;;
|
chat) seed_name="chat" ;;
|
||||||
|
edge) seed_name="ops-repo" ;;
|
||||||
esac
|
esac
|
||||||
local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh"
|
local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh"
|
||||||
if [ -x "$seed_script" ]; then
|
if [ -x "$seed_script" ]; then
|
||||||
|
|
@ -2891,6 +2896,23 @@ EOF
|
||||||
esac
|
esac
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ── backup command ────────────────────────────────────────────────────────────
|
||||||
|
# Usage: disinto backup create <outfile.tar.gz>
|
||||||
|
disinto_backup() {
|
||||||
|
local subcmd="${1:-}"
|
||||||
|
shift || true
|
||||||
|
|
||||||
|
case "$subcmd" in
|
||||||
|
create)
|
||||||
|
backup_create "$@"
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "Usage: disinto backup create <outfile.tar.gz>" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
}
|
||||||
|
|
||||||
# ── Main dispatch ────────────────────────────────────────────────────────────
|
# ── Main dispatch ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
case "${1:-}" in
|
case "${1:-}" in
|
||||||
|
|
@ -2907,6 +2929,7 @@ case "${1:-}" in
|
||||||
hire-an-agent) shift; disinto_hire_an_agent "$@" ;;
|
hire-an-agent) shift; disinto_hire_an_agent "$@" ;;
|
||||||
agent) shift; disinto_agent "$@" ;;
|
agent) shift; disinto_agent "$@" ;;
|
||||||
edge) shift; disinto_edge "$@" ;;
|
edge) shift; disinto_edge "$@" ;;
|
||||||
|
backup) shift; disinto_backup "$@" ;;
|
||||||
-h|--help) usage ;;
|
-h|--help) usage ;;
|
||||||
*) usage ;;
|
*) usage ;;
|
||||||
esac
|
esac
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
|
<!-- last-reviewed: 5ba18c8f80da6e3e574823e39e5aa760731c1705 -->
|
||||||
# Dev Agent
|
# Dev Agent
|
||||||
|
|
||||||
**Role**: Implement issues autonomously — write code, push branches, address
|
**Role**: Implement issues autonomously — write code, push branches, address
|
||||||
|
|
|
||||||
|
|
@ -173,11 +173,15 @@ PROJECT_TOML="${PROJECT_TOML:-projects/disinto.toml}"
|
||||||
sleep 1200 # 20 minutes
|
sleep 1200 # 20 minutes
|
||||||
done) &
|
done) &
|
||||||
|
|
||||||
# ── Load required secrets from secrets/*.enc (#777) ────────────────────
|
# ── Load optional secrets from secrets/*.enc (#777) ────────────────────
|
||||||
# Edge container declares its required secrets; missing ones cause a hard fail.
|
# Engagement collection (collect-engagement.sh) requires CADDY_ secrets to
|
||||||
|
# SCP access logs from a remote edge host. When age key or secrets dir is
|
||||||
|
# missing, or any secret fails to decrypt, log a warning and skip the cron.
|
||||||
|
# Caddy itself does not depend on these secrets.
|
||||||
_AGE_KEY_FILE="${HOME}/.config/sops/age/keys.txt"
|
_AGE_KEY_FILE="${HOME}/.config/sops/age/keys.txt"
|
||||||
_SECRETS_DIR="/opt/disinto/secrets"
|
_SECRETS_DIR="/opt/disinto/secrets"
|
||||||
EDGE_REQUIRED_SECRETS="CADDY_SSH_KEY CADDY_SSH_HOST CADDY_SSH_USER CADDY_ACCESS_LOG"
|
EDGE_REQUIRED_SECRETS="CADDY_SSH_KEY CADDY_SSH_HOST CADDY_SSH_USER CADDY_ACCESS_LOG"
|
||||||
|
EDGE_ENGAGEMENT_READY=0 # Assume not ready until proven otherwise
|
||||||
|
|
||||||
_edge_decrypt_secret() {
|
_edge_decrypt_secret() {
|
||||||
local enc_path="${_SECRETS_DIR}/${1}.enc"
|
local enc_path="${_SECRETS_DIR}/${1}.enc"
|
||||||
|
|
@ -192,22 +196,25 @@ if [ -f "$_AGE_KEY_FILE" ] && [ -d "$_SECRETS_DIR" ]; then
|
||||||
export "$_secret_name=$_val"
|
export "$_secret_name=$_val"
|
||||||
done
|
done
|
||||||
if [ -n "$_missing" ]; then
|
if [ -n "$_missing" ]; then
|
||||||
echo "FATAL: required secrets missing from secrets/*.enc:${_missing}" >&2
|
echo "WARN: required engagement secrets missing from secrets/*.enc:${_missing}" >&2
|
||||||
echo " Run 'disinto secrets add <NAME>' for each missing secret." >&2
|
echo " collect-engagement cron will be skipped. Run 'disinto secrets add <NAME>' to enable." >&2
|
||||||
echo " If migrating from .env.vault.enc, run 'disinto secrets migrate-from-vault' first." >&2
|
EDGE_ENGAGEMENT_READY=0
|
||||||
exit 1
|
else
|
||||||
|
echo "edge: loaded required engagement secrets: ${EDGE_REQUIRED_SECRETS}" >&2
|
||||||
|
EDGE_ENGAGEMENT_READY=1
|
||||||
fi
|
fi
|
||||||
echo "edge: loaded required secrets: ${EDGE_REQUIRED_SECRETS}" >&2
|
|
||||||
else
|
else
|
||||||
echo "FATAL: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — cannot load required secrets" >&2
|
echo "WARN: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — engagement secrets unavailable" >&2
|
||||||
echo " Ensure age is installed and secrets/*.enc files are present." >&2
|
echo " collect-engagement cron will be skipped. Run 'disinto secrets add <NAME>' to enable." >&2
|
||||||
exit 1
|
EDGE_ENGAGEMENT_READY=0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Start daily engagement collection cron loop in background (#745)
|
# Start daily engagement collection cron loop in background (#745)
|
||||||
# Runs collect-engagement.sh daily at ~23:50 UTC via a sleep loop that
|
# Runs collect-engagement.sh daily at ~23:50 UTC via a sleep loop that
|
||||||
# calculates seconds until the next 23:50 window. SSH key from secrets/*.enc (#777).
|
# calculates seconds until the next 23:50 window. SSH key from secrets/*.enc (#777).
|
||||||
(while true; do
|
# Guarded: only start if EDGE_ENGAGEMENT_READY=1.
|
||||||
|
if [ "$EDGE_ENGAGEMENT_READY" -eq 1 ]; then
|
||||||
|
(while true; do
|
||||||
# Calculate seconds until next 23:50 UTC
|
# Calculate seconds until next 23:50 UTC
|
||||||
_now=$(date -u +%s)
|
_now=$(date -u +%s)
|
||||||
_target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0)
|
_target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0)
|
||||||
|
|
@ -232,7 +239,10 @@ fi
|
||||||
echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2
|
echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2
|
||||||
fi
|
fi
|
||||||
rm -f "$_fetch_log"
|
rm -f "$_fetch_log"
|
||||||
done) &
|
done) &
|
||||||
|
else
|
||||||
|
echo "edge: collect-engagement cron skipped (EDGE_ENGAGEMENT_READY=0)" >&2
|
||||||
|
fi
|
||||||
|
|
||||||
# Nomad template renders Caddyfile to /local/Caddyfile via service discovery;
|
# Nomad template renders Caddyfile to /local/Caddyfile via service discovery;
|
||||||
# copy it into the expected location if present (compose uses the mounted path).
|
# copy it into the expected location if present (compose uses the mounted path).
|
||||||
|
|
|
||||||
183
docs/nomad-cutover-runbook.md
Normal file
183
docs/nomad-cutover-runbook.md
Normal file
|
|
@ -0,0 +1,183 @@
|
||||||
|
# Nomad Cutover Runbook
|
||||||
|
|
||||||
|
End-to-end procedure to cut over the disinto factory from docker-compose on
|
||||||
|
disinto-dev-box to Nomad on disinto-nomad-box.
|
||||||
|
|
||||||
|
**Target**: disinto-nomad-box (10.10.10.216) becomes production; disinto-dev-box
|
||||||
|
stays warm for rollback.
|
||||||
|
|
||||||
|
**Downtime budget**: <5 min blue-green flip.
|
||||||
|
|
||||||
|
**Data scope**: Forgejo issues + disinto-ops git bundle only. Everything else is
|
||||||
|
regenerated or discarded. OAuth secrets are regenerated on fresh init (all
|
||||||
|
sessions invalidated).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Pre-cutover readiness checklist
|
||||||
|
|
||||||
|
- [ ] Nomad + Vault stack healthy on a fresh wipe+init (step 5 verified)
|
||||||
|
- [ ] Codeberg mirror current — `git log` parity between dev-box Forgejo and
|
||||||
|
Codeberg
|
||||||
|
- [ ] SSH key pair generated for nomad-box, registered on DO edge (see §4.6)
|
||||||
|
- [ ] Companion tools landed:
|
||||||
|
- `disinto backup create` (#1057)
|
||||||
|
- `disinto backup import` (#1058)
|
||||||
|
- [ ] Backup tarball produced and tested against a scratch LXC (see §3)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Pre-cutover artifact: backup
|
||||||
|
|
||||||
|
On disinto-dev-box:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./bin/disinto backup create /tmp/disinto-backup-$(date +%Y%m%d).tar.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
Copy the tarball to nomad-box (and optionally to a local workstation for
|
||||||
|
safekeeping):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
scp /tmp/disinto-backup-*.tar.gz nomad-box:/tmp/
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Pre-cutover dry-run
|
||||||
|
|
||||||
|
On a throwaway LXC:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
lxc launch ubuntu:24.04 cutover-dryrun
|
||||||
|
# inside the container:
|
||||||
|
disinto init --backend=nomad --import-env .env --with edge
|
||||||
|
./bin/disinto backup import /tmp/disinto-backup-*.tar.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify:
|
||||||
|
|
||||||
|
- Issue count matches source Forgejo
|
||||||
|
- disinto-ops repo refs match source bundle
|
||||||
|
|
||||||
|
Destroy the LXC once satisfied:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
lxc delete cutover-dryrun --force
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Cutover T-0 (operator executes; <5 min target)
|
||||||
|
|
||||||
|
### 4.1 Stop dev-box services
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On disinto-dev-box — stop, do NOT remove volumes (rollback needs them)
|
||||||
|
docker-compose stop
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.2 Provision nomad-box (if not already done)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On disinto-nomad-box
|
||||||
|
disinto init --backend=nomad --import-env .env --with edge
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.3 Import backup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On disinto-nomad-box
|
||||||
|
./bin/disinto backup import /tmp/disinto-backup-*.tar.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.4 Configure Codeberg pull mirror
|
||||||
|
|
||||||
|
Manual, one-time step in the new Forgejo UI:
|
||||||
|
|
||||||
|
1. Create a mirror repository pointing at the Codeberg upstream
|
||||||
|
2. Confirm initial sync completes
|
||||||
|
|
||||||
|
### 4.5 Claude login
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On disinto-nomad-box
|
||||||
|
claude login
|
||||||
|
```
|
||||||
|
|
||||||
|
Set up Anthropic OAuth so agents can authenticate.
|
||||||
|
|
||||||
|
### 4.6 Autossh tunnel swap
|
||||||
|
|
||||||
|
> **Operator step** — cross-host, no dev-agent involvement. Do NOT automate.
|
||||||
|
|
||||||
|
1. Stop the tunnel on dev-box:
|
||||||
|
```bash
|
||||||
|
# On disinto-dev-box
|
||||||
|
systemctl stop reverse-tunnel
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Copy or regenerate the tunnel unit on nomad-box:
|
||||||
|
```bash
|
||||||
|
# Copy from dev-box, or let init regenerate it
|
||||||
|
scp dev-box:/etc/systemd/system/reverse-tunnel.service \
|
||||||
|
nomad-box:/etc/systemd/system/
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Register nomad-box's public key on DO edge:
|
||||||
|
```bash
|
||||||
|
# On DO edge box — same restricted-command as the dev-box key
|
||||||
|
echo "<nomad-box-pubkey>" >> /home/johba/.ssh/authorized_keys
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Start the tunnel on nomad-box:
|
||||||
|
```bash
|
||||||
|
# On disinto-nomad-box
|
||||||
|
systemctl enable --now reverse-tunnel
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Verify end-to-end:
|
||||||
|
```bash
|
||||||
|
curl https://self.disinto.ai/api/v1/version
|
||||||
|
# Should return the new box's Forgejo version
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Post-cutover smoke
|
||||||
|
|
||||||
|
- [ ] `curl https://self.disinto.ai` → Forgejo welcome page
|
||||||
|
- [ ] Create a test PR → Woodpecker pipeline runs → agents assign and work
|
||||||
|
- [ ] Claude chat login via Forgejo OAuth succeeds
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Rollback (if any step 4 gate fails)
|
||||||
|
|
||||||
|
1. Stop the tunnel on nomad-box:
|
||||||
|
```bash
|
||||||
|
systemctl stop reverse-tunnel # on nomad-box
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Restore the tunnel on dev-box:
|
||||||
|
```bash
|
||||||
|
systemctl start reverse-tunnel # on dev-box
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Bring dev-box services back up:
|
||||||
|
```bash
|
||||||
|
docker-compose up -d # on dev-box
|
||||||
|
```
|
||||||
|
|
||||||
|
4. DO Caddy config is unchanged — traffic restores in <5 min.
|
||||||
|
|
||||||
|
5. File a post-mortem issue. Keep nomad-box state intact for debugging.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Post-stable cleanup (T+1 week)
|
||||||
|
|
||||||
|
- `docker-compose down -v` on dev-box
|
||||||
|
- Archive `/var/lib/docker/volumes/disinto_*` to cold storage
|
||||||
|
- Delete disinto-dev-box LXC or keep as permanent rollback reserve (operator
|
||||||
|
decision)
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
|
<!-- last-reviewed: 5ba18c8f80da6e3e574823e39e5aa760731c1705 -->
|
||||||
# Gardener Agent
|
# Gardener Agent
|
||||||
|
|
||||||
**Role**: Backlog grooming — detect duplicate issues, missing acceptance
|
**Role**: Backlog grooming — detect duplicate issues, missing acceptance
|
||||||
|
|
|
||||||
|
|
@ -1 +0,0 @@
|
||||||
{"issue":850,"group":"lib/generators.sh","title":"compose dup-detection smoke CI failures","reason":"4+ consecutive ci_exhausted failures across PRs #872 #908 #971; planner flagged for human re-scope","ts":"2026-04-19T00:00:00Z"}
|
|
||||||
|
|
@ -1,8 +1,23 @@
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"action": "edit_body",
|
"action": "add_label",
|
||||||
|
"issue": 1047,
|
||||||
|
"label": "backlog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"action": "add_label",
|
||||||
|
"issue": 1047,
|
||||||
|
"label": "priority"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"action": "add_label",
|
||||||
|
"issue": 1044,
|
||||||
|
"label": "backlog"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"action": "remove_label",
|
||||||
"issue": 1025,
|
"issue": 1025,
|
||||||
"body": "## Goal\nVerify that Forgejo, Woodpecker, and chat all function correctly when served\nunder /forge/, /ci/, and /chat/ subpaths on a single domain. Catch redirect\nloops, OAuth callback failures, and asset 404s before they hit production.\n\n## Sprint\nPart of sprint [edge-subpath-chat](https://forgejo:3000/disinto-admin/disinto-ops/pulls/37) — vision issue #623.\n\n## Acceptance criteria\n- [ ] Forgejo login at /forge/ completes without redirect loops\n- [ ] Forgejo OAuth callback for Woodpecker succeeds under subpath\n- [ ] Woodpecker dashboard loads all assets at /ci/ (no 404s on JS/CSS)\n- [ ] Chat OAuth login flow works at /chat/login\n- [ ] Forward_auth on /chat/* rejects unauthenticated requests with 401\n- [ ] Staging content loads at /staging/\n- [ ] Root / redirects to /forge/\n- [ ] CI pipeline added to .woodpecker/ to run this test on edge-related changes\n\n## Affected files\n- `nomad/jobs/edge.hcl` — edge Caddy routing config under test\n- `docker/edge/` — edge container and Caddyfile template\n- `tools/edge-control/register.sh` — route registration\n- `.woodpecker/` — CI pipeline for edge smoke test\n\n## Dependencies\nNone — first issue in sprint.\n"
|
"label": "blocked"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"action": "add_label",
|
"action": "add_label",
|
||||||
|
|
@ -10,33 +25,23 @@
|
||||||
"label": "backlog"
|
"label": "backlog"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"action": "edit_body",
|
"action": "comment",
|
||||||
"issue": 1026,
|
"issue": 1025,
|
||||||
"body": "## Goal\nReplace the blocking one-shot claude --print invocation in the chat backend with\na WebSocket connection that streams tokens to the UI as they arrive.\n\n## Sprint\nPart of sprint [edge-subpath-chat](https://forgejo:3000/disinto-admin/disinto-ops/pulls/37) — vision issue #623.\n\n## Acceptance criteria\n- [ ] /chat/ws endpoint accepts WebSocket upgrade with valid session cookie\n- [ ] /chat/ws rejects upgrade if session cookie is missing or expired\n- [ ] Chat backend streams claude output over WebSocket as text frames\n- [ ] UI renders tokens incrementally as they arrive\n- [ ] Rate limiting still enforced on WebSocket messages\n- [ ] Caddy proxies WebSocket upgrade correctly through /chat/ws with forward_auth\n\n## Affected files\n- `docker/chat/server.py` — chat backend WebSocket endpoint\n- `docker/chat/ui/` — frontend WebSocket client rendering\n- `nomad/jobs/edge.hcl` — Caddy WebSocket proxy config\n- `nomad/jobs/chat.hcl` — chat Nomad job\n\n## Dependencies\n- Depends on #1025 — subpath routing smoke test\n"
|
"body": "Gardener: removing `blocked` — fix path is well-defined (Option 1: static-checks-only pipeline). Promoting to backlog for next dev pick-up. Dev must follow the acceptance criteria literally — no live service curls, static checks only."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"action": "remove_label",
|
||||||
|
"issue": 850,
|
||||||
|
"label": "blocked"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"action": "add_label",
|
"action": "add_label",
|
||||||
"issue": 1026,
|
"issue": 850,
|
||||||
"label": "backlog"
|
"label": "backlog"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"action": "edit_body",
|
"action": "comment",
|
||||||
"issue": 1027,
|
"issue": 850,
|
||||||
"body": "## Goal\nGive the chat container Claude session read-write access to the project working\ntree so the operator can inspect, explain, or modify code — scoped to that tree\nonly, with no access to factory internals, secrets, or Docker socket.\n\n## Sprint\nPart of sprint [edge-subpath-chat](https://forgejo:3000/disinto-admin/disinto-ops/pulls/37) — vision issue #623.\n\n## Acceptance criteria\n- [ ] Chat container bind-mounts the project working tree as a named volume\n- [ ] Claude invocation in server.py sets cwd to the workspace directory\n- [ ] Claude permission mode is acceptEdits (not bypassPermissions)\n- [ ] verify-chat-sandbox.sh updated to assert workspace mount exists\n- [ ] Compose generator adds the workspace volume conditionally\n\n## Affected files\n- `docker/chat/server.py` — Claude invocation and cwd setup\n- `tools/edge-control/verify-chat-sandbox.sh` — sandbox verification\n- `lib/generators.sh` — Compose generator workspace volume\n- `nomad/jobs/chat.hcl` — chat container bind-mount config\n\n## Dependencies\n- Depends on #1025 — subpath routing smoke test\n"
|
"body": "Gardener: removing `blocked` — 5th attempt recipe is at the top of this issue. Dev must follow the recipe exactly (call `_generate_compose_impl` directly in isolated FACTORY_ROOT, do NOT use `bin/disinto init`). Do not copy patterns from prior PRs."
|
||||||
},
|
|
||||||
{
|
|
||||||
"action": "add_label",
|
|
||||||
"issue": 1027,
|
|
||||||
"label": "backlog"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"action": "edit_body",
|
|
||||||
"issue": 1028,
|
|
||||||
"body": "## Goal\nIf the smoke test reveals unfixable subpath issues, automate the pivot to\nper-service subdomains so the switch is a single config change.\n\n## Sprint\nPart of sprint [edge-subpath-chat](https://forgejo:3000/disinto-admin/disinto-ops/pulls/37) — vision issue #623.\n\n## Acceptance criteria\n- [ ] generators.sh _generate_caddyfile_impl accepts EDGE_ROUTING_MODE env var\n- [ ] In subdomain mode, Caddyfile emits four host blocks per edge-routing-fallback.md\n- [ ] register.sh registers additional subdomain routes when EDGE_ROUTING_MODE=subdomain\n- [ ] OAuth redirect URIs in ci-setup.sh respect routing mode\n- [ ] .env template documents EDGE_ROUTING_MODE with a comment referencing the fallback doc\n\n## Affected files\n- `lib/generators.sh` — _generate_caddyfile_impl routing mode switch\n- `tools/edge-control/register.sh` — subdomain route registration\n- `lib/ci-setup.sh` — OAuth redirect URI handling\n- `projects/*.toml.example` — .env template documentation\n\n## Dependencies\n- Depends on #1025 — subpath routing smoke test\n"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"action": "add_label",
|
|
||||||
"issue": 1028,
|
|
||||||
"label": "backlog"
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
|
<!-- last-reviewed: 5ba18c8f80da6e3e574823e39e5aa760731c1705 -->
|
||||||
# Shared Helpers (`lib/`)
|
# Shared Helpers (`lib/`)
|
||||||
|
|
||||||
All agents source `lib/env.sh` as their first action. Additional helpers are
|
All agents source `lib/env.sh` as their first action. Additional helpers are
|
||||||
|
|
@ -35,4 +35,4 @@ sourced as needed.
|
||||||
| `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) |
|
| `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) |
|
||||||
| `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) |
|
| `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) |
|
||||||
| `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_get_or_empty(PATH)` — GET /v1/PATH; 200→raw body, 404→empty, else structured error + return 1 (used by sync scripts to distinguish "absent, create" from hard failure without tripping errexit, #881). `hvault_ensure_kv_v2(MOUNT, [LOG_PREFIX])` — idempotent KV v2 mount assertion: enables mount if absent, fails loudly if present as wrong type/version. Extracted from all `vault-seed-*.sh` scripts to eliminate dup-detector violations. Respects `DRY_RUN=1`. `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. `_hvault_seed_key(PATH, KEY, [GENERATOR])` — seed one KV key if absent; reads existing data and merges to preserve sibling keys (KV v2 replaces atomically); returns 0=created, 1=unchanged, 2=API error (#992). All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | `tools/vault-apply-policies.sh`, `tools/vault-apply-roles.sh`, `lib/init/nomad/vault-nomad-auth.sh`, `tools/vault-seed-*.sh` |
|
| `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_get_or_empty(PATH)` — GET /v1/PATH; 200→raw body, 404→empty, else structured error + return 1 (used by sync scripts to distinguish "absent, create" from hard failure without tripping errexit, #881). `hvault_ensure_kv_v2(MOUNT, [LOG_PREFIX])` — idempotent KV v2 mount assertion: enables mount if absent, fails loudly if present as wrong type/version. Extracted from all `vault-seed-*.sh` scripts to eliminate dup-detector violations. Respects `DRY_RUN=1`. `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. `_hvault_seed_key(PATH, KEY, [GENERATOR])` — seed one KV key if absent; reads existing data and merges to preserve sibling keys (KV v2 replaces atomically); returns 0=created, 1=unchanged, 2=API error (#992). All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | `tools/vault-apply-policies.sh`, `tools/vault-apply-roles.sh`, `lib/init/nomad/vault-nomad-auth.sh`, `tools/vault-seed-*.sh` |
|
||||||
| `lib/init/nomad/` | Nomad+Vault installer scripts. `cluster-up.sh` — idempotent Step-0 orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling; `HOST_VOLUME_DIRS` array now includes `/srv/disinto/docker` (for staging file-server, S5.2, #989, #992). `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. `vault-nomad-auth.sh` — Step-2 script that enables Vault's JWT auth at path `jwt-nomad`, writes the JWKS/algs config pointing at Nomad's workload-identity signer, delegates role sync to `tools/vault-apply-roles.sh`, installs `/etc/nomad.d/server.hcl`, and SIGHUPs `nomad.service` if the file changed (#881). `wp-oauth-register.sh` — S3.3 script that creates the Woodpecker OAuth2 app in Forgejo and stores `forgejo_client`/`forgejo_secret` in Vault KV v2 at `kv/disinto/shared/woodpecker`; idempotent (skips if app or secrets already present); called by `bin/disinto --with woodpecker`. `deploy.sh` — S4 dependency-ordered Nomad job deploy + health-wait; takes a list of jobspec basenames, submits each to Nomad and polls until healthy before proceeding to the next; supports `--dry-run` and per-job timeout overrides via `JOB_READY_TIMEOUT_<JOBNAME>`; invoked by `bin/disinto --with <svc>` and `cluster-up.sh`; deploy order now covers staging, chat, edge (S5.5, #992). Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` |
|
| `lib/init/nomad/` | Nomad+Vault installer scripts. `cluster-up.sh` — idempotent Step-0 orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling; `HOST_VOLUME_DIRS` array now includes `/srv/disinto/docker` (for staging file-server, S5.2, #989, #992). `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. `vault-nomad-auth.sh` — Step-2 script that enables Vault's JWT auth at path `jwt-nomad`, writes the JWKS/algs config pointing at Nomad's workload-identity signer, delegates role sync to `tools/vault-apply-roles.sh`, installs `/etc/nomad.d/server.hcl`, and SIGHUPs `nomad.service` if the file changed (#881). `wp-oauth-register.sh` — S3.3 script that creates the Woodpecker OAuth2 app in Forgejo and stores `forgejo_client`/`forgejo_secret` in Vault KV v2 at `kv/disinto/shared/woodpecker`; idempotent (skips if app or secrets already present); called by `bin/disinto --with woodpecker`. `deploy.sh` — S4 dependency-ordered Nomad job deploy + health-wait; takes a list of jobspec basenames, submits each to Nomad and polls until healthy before proceeding to the next; supports `--dry-run` and per-job timeout overrides via `JOB_READY_TIMEOUT_<JOBNAME>`; global default timeout `JOB_READY_TIMEOUT_SECS` is 360s (raised from 240s for chat cold-start, #1036); invoked by `bin/disinto --with <svc>` and `cluster-up.sh`; deploy order now covers staging, chat, edge (S5.5, #992). Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` |
|
||||||
|
|
|
||||||
|
|
@ -52,8 +52,9 @@ claude_run_with_watchdog() {
|
||||||
out_file=$(mktemp) || return 1
|
out_file=$(mktemp) || return 1
|
||||||
trap 'rm -f "$out_file"' RETURN
|
trap 'rm -f "$out_file"' RETURN
|
||||||
|
|
||||||
# Start claude in background, capturing stdout to temp file
|
# Start claude in new process group (setsid creates new session, $pid is PGID leader)
|
||||||
"${cmd[@]}" > "$out_file" 2>>"$LOGFILE" &
|
# All children of claude will inherit this process group
|
||||||
|
setsid "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" &
|
||||||
pid=$!
|
pid=$!
|
||||||
|
|
||||||
# Background watchdog: poll for final result marker
|
# Background watchdog: poll for final result marker
|
||||||
|
|
@ -84,12 +85,12 @@ claude_run_with_watchdog() {
|
||||||
sleep "$grace"
|
sleep "$grace"
|
||||||
if kill -0 "$pid" 2>/dev/null; then
|
if kill -0 "$pid" 2>/dev/null; then
|
||||||
log "watchdog: claude -p idle for ${grace}s after final result; SIGTERM"
|
log "watchdog: claude -p idle for ${grace}s after final result; SIGTERM"
|
||||||
kill -TERM "$pid" 2>/dev/null || true
|
kill -TERM -- "-$pid" 2>/dev/null || true
|
||||||
# Give it a moment to clean up
|
# Give it a moment to clean up
|
||||||
sleep 5
|
sleep 5
|
||||||
if kill -0 "$pid" 2>/dev/null; then
|
if kill -0 "$pid" 2>/dev/null; then
|
||||||
log "watchdog: force kill after SIGTERM timeout"
|
log "watchdog: force kill after SIGTERM timeout"
|
||||||
kill -KILL "$pid" 2>/dev/null || true
|
kill -KILL -- "-$pid" 2>/dev/null || true
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
@ -100,16 +101,16 @@ claude_run_with_watchdog() {
|
||||||
timeout --foreground "${CLAUDE_TIMEOUT:-7200}" tail --pid="$pid" -f /dev/null 2>/dev/null
|
timeout --foreground "${CLAUDE_TIMEOUT:-7200}" tail --pid="$pid" -f /dev/null 2>/dev/null
|
||||||
rc=$?
|
rc=$?
|
||||||
|
|
||||||
# Clean up the watchdog
|
# Clean up the watchdog (target process group if it spawned children)
|
||||||
kill "$grace_pid" 2>/dev/null || true
|
kill -- "-$grace_pid" 2>/dev/null || true
|
||||||
wait "$grace_pid" 2>/dev/null || true
|
wait "$grace_pid" 2>/dev/null || true
|
||||||
|
|
||||||
# When timeout fires (rc=124), explicitly kill the orphaned claude process
|
# When timeout fires (rc=124), explicitly kill the orphaned claude process group
|
||||||
# tail --pid is a passive waiter, not a supervisor
|
# tail --pid is a passive waiter, not a supervisor
|
||||||
if [ "$rc" -eq 124 ]; then
|
if [ "$rc" -eq 124 ]; then
|
||||||
kill "$pid" 2>/dev/null || true
|
kill -TERM -- "-$pid" 2>/dev/null || true
|
||||||
sleep 1
|
sleep 1
|
||||||
kill -KILL "$pid" 2>/dev/null || true
|
kill -KILL -- "-$pid" 2>/dev/null || true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Output the captured stdout
|
# Output the captured stdout
|
||||||
|
|
|
||||||
136
lib/backup.sh
Normal file
136
lib/backup.sh
Normal file
|
|
@ -0,0 +1,136 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# =============================================================================
|
||||||
|
# disinto backup — export factory state for migration
|
||||||
|
#
|
||||||
|
# Usage: source this file, then call backup_create <outfile.tar.gz>
|
||||||
|
# Requires: FORGE_URL, FORGE_TOKEN, FORGE_REPO, FORGE_OPS_REPO, OPS_REPO_ROOT
|
||||||
|
# =============================================================================
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Fetch all issues (open + closed) for a repo slug and emit the normalized JSON array.
|
||||||
|
# Usage: _backup_fetch_issues <org/repo>
|
||||||
|
_backup_fetch_issues() {
|
||||||
|
local repo_slug="$1"
|
||||||
|
local api_url="${FORGE_API_BASE}/repos/${repo_slug}"
|
||||||
|
|
||||||
|
local all_issues="[]"
|
||||||
|
for state in open closed; do
|
||||||
|
local page=1
|
||||||
|
while true; do
|
||||||
|
local page_items
|
||||||
|
page_items=$(curl -sf -X GET \
|
||||||
|
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
"${api_url}/issues?state=${state}&type=issues&limit=50&page=${page}") || {
|
||||||
|
echo "ERROR: failed to fetch ${state} issues from ${repo_slug} (page ${page})" >&2
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
local count
|
||||||
|
count=$(printf '%s' "$page_items" | jq 'length' 2>/dev/null) || count=0
|
||||||
|
[ -z "$count" ] && count=0
|
||||||
|
[ "$count" -eq 0 ] && break
|
||||||
|
all_issues=$(printf '%s\n%s' "$all_issues" "$page_items" | jq -s 'add')
|
||||||
|
[ "$count" -lt 50 ] && break
|
||||||
|
page=$((page + 1))
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
# Normalize to the schema: number, title, body, labels, state
|
||||||
|
printf '%s' "$all_issues" | jq '[.[] | {
|
||||||
|
number: .number,
|
||||||
|
title: .title,
|
||||||
|
body: .body,
|
||||||
|
labels: [.labels[]?.name],
|
||||||
|
state: .state
|
||||||
|
}] | sort_by(.number)'
|
||||||
|
}
|
||||||
|
|
||||||
|
# Create a backup tarball of factory state.
|
||||||
|
# Usage: backup_create <outfile.tar.gz>
|
||||||
|
backup_create() {
|
||||||
|
local outfile="${1:-}"
|
||||||
|
if [ -z "$outfile" ]; then
|
||||||
|
echo "Error: output file required" >&2
|
||||||
|
echo "Usage: disinto backup create <outfile.tar.gz>" >&2
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Resolve to absolute path before cd-ing into tmpdir
|
||||||
|
case "$outfile" in
|
||||||
|
/*) ;;
|
||||||
|
*) outfile="$(pwd)/${outfile}" ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
# Validate required env
|
||||||
|
: "${FORGE_URL:?FORGE_URL must be set}"
|
||||||
|
: "${FORGE_TOKEN:?FORGE_TOKEN must be set}"
|
||||||
|
: "${FORGE_REPO:?FORGE_REPO must be set}"
|
||||||
|
|
||||||
|
local forge_ops_repo="${FORGE_OPS_REPO:-${FORGE_REPO}-ops}"
|
||||||
|
local ops_repo_root="${OPS_REPO_ROOT:-}"
|
||||||
|
|
||||||
|
if [ -z "$ops_repo_root" ] || [ ! -d "$ops_repo_root/.git" ]; then
|
||||||
|
echo "Error: OPS_REPO_ROOT (${ops_repo_root:-<unset>}) is not a valid git repo" >&2
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
local tmpdir
|
||||||
|
tmpdir=$(mktemp -d)
|
||||||
|
trap 'rm -rf "$tmpdir"' EXIT
|
||||||
|
|
||||||
|
local project_name="${FORGE_REPO##*/}"
|
||||||
|
|
||||||
|
echo "=== disinto backup create ==="
|
||||||
|
echo "Forge: ${FORGE_URL}"
|
||||||
|
echo "Repos: ${FORGE_REPO}, ${forge_ops_repo}"
|
||||||
|
|
||||||
|
# ── 1. Export issues ──────────────────────────────────────────────────────
|
||||||
|
mkdir -p "${tmpdir}/issues"
|
||||||
|
|
||||||
|
echo "Fetching issues for ${FORGE_REPO}..."
|
||||||
|
_backup_fetch_issues "$FORGE_REPO" > "${tmpdir}/issues/${project_name}.json"
|
||||||
|
local main_count
|
||||||
|
main_count=$(jq 'length' "${tmpdir}/issues/${project_name}.json")
|
||||||
|
echo " ${main_count} issues exported"
|
||||||
|
|
||||||
|
echo "Fetching issues for ${forge_ops_repo}..."
|
||||||
|
_backup_fetch_issues "$forge_ops_repo" > "${tmpdir}/issues/${project_name}-ops.json"
|
||||||
|
local ops_count
|
||||||
|
ops_count=$(jq 'length' "${tmpdir}/issues/${project_name}-ops.json")
|
||||||
|
echo " ${ops_count} issues exported"
|
||||||
|
|
||||||
|
# ── 2. Git bundle of ops repo ────────────────────────────────────────────
|
||||||
|
mkdir -p "${tmpdir}/repos"
|
||||||
|
|
||||||
|
echo "Creating git bundle for ${forge_ops_repo}..."
|
||||||
|
git -C "$ops_repo_root" bundle create "${tmpdir}/repos/${project_name}-ops.bundle" --all 2>&1
|
||||||
|
echo " bundle created ($(du -h "${tmpdir}/repos/${project_name}-ops.bundle" | cut -f1))"
|
||||||
|
|
||||||
|
# ── 3. Metadata ──────────────────────────────────────────────────────────
|
||||||
|
local created_at
|
||||||
|
created_at=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
|
||||||
|
jq -n \
|
||||||
|
--arg created_at "$created_at" \
|
||||||
|
--arg source_host "$(hostname)" \
|
||||||
|
--argjson schema_version 1 \
|
||||||
|
--arg forgejo_url "$FORGE_URL" \
|
||||||
|
'{
|
||||||
|
created_at: $created_at,
|
||||||
|
source_host: $source_host,
|
||||||
|
schema_version: $schema_version,
|
||||||
|
forgejo_url: $forgejo_url
|
||||||
|
}' > "${tmpdir}/metadata.json"
|
||||||
|
|
||||||
|
# ── 4. Pack tarball ──────────────────────────────────────────────────────
|
||||||
|
echo "Creating tarball: ${outfile}"
|
||||||
|
tar -czf "$outfile" -C "$tmpdir" metadata.json issues repos
|
||||||
|
local size
|
||||||
|
size=$(du -h "$outfile" | cut -f1)
|
||||||
|
echo "=== Backup complete: ${outfile} (${size}) ==="
|
||||||
|
|
||||||
|
# Clean up before returning — the EXIT trap references the local $tmpdir
|
||||||
|
# which goes out of scope after return, causing 'unbound variable' under set -u.
|
||||||
|
trap - EXIT
|
||||||
|
rm -rf "$tmpdir"
|
||||||
|
}
|
||||||
|
|
@ -247,6 +247,31 @@ ci_promote() {
|
||||||
echo "$new_num"
|
echo "$new_num"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ci_get_step_logs <pipeline_num> <step_id>
|
||||||
|
# Fetches logs for a single CI step via the Woodpecker API.
|
||||||
|
# Requires: WOODPECKER_REPO_ID, woodpecker_api() (from env.sh)
|
||||||
|
# Returns: 0 on success, 1 on failure. Outputs log text to stdout.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# ci_get_step_logs 1423 5 # Get logs for step ID 5 in pipeline 1423
|
||||||
|
ci_get_step_logs() {
|
||||||
|
local pipeline_num="$1" step_id="$2"
|
||||||
|
|
||||||
|
if [ -z "$pipeline_num" ] || [ -z "$step_id" ]; then
|
||||||
|
echo "Usage: ci_get_step_logs <pipeline_num> <step_id>" >&2
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "${WOODPECKER_REPO_ID:-}" ] || [ "${WOODPECKER_REPO_ID}" = "0" ]; then
|
||||||
|
echo "ERROR: WOODPECKER_REPO_ID not set or zero" >&2
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
woodpecker_api "/repos/${WOODPECKER_REPO_ID}/logs/${pipeline_num}/${step_id}" \
|
||||||
|
--max-time 15 2>/dev/null \
|
||||||
|
| jq -r '.[].data // empty' 2>/dev/null
|
||||||
|
}
|
||||||
|
|
||||||
# ci_get_logs <pipeline_number> [--step <step_name>]
|
# ci_get_logs <pipeline_number> [--step <step_name>]
|
||||||
# Reads CI logs from the Woodpecker SQLite database.
|
# Reads CI logs from the Woodpecker SQLite database.
|
||||||
# Requires: WOODPECKER_DATA_DIR env var or mounted volume at /woodpecker-data
|
# Requires: WOODPECKER_DATA_DIR env var or mounted volume at /woodpecker-data
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,28 @@ PROJECT_NAME="${PROJECT_NAME:-project}"
|
||||||
# PRIMARY_BRANCH defaults to main (env.sh may have set it to 'master')
|
# PRIMARY_BRANCH defaults to main (env.sh may have set it to 'master')
|
||||||
PRIMARY_BRANCH="${PRIMARY_BRANCH:-main}"
|
PRIMARY_BRANCH="${PRIMARY_BRANCH:-main}"
|
||||||
|
|
||||||
|
# Track service names for duplicate detection
|
||||||
|
declare -A _seen_services
|
||||||
|
declare -A _service_sources
|
||||||
|
|
||||||
|
# Record a service name and its source; return 0 if unique, 1 if duplicate
|
||||||
|
_record_service() {
|
||||||
|
local service_name="$1"
|
||||||
|
local source="$2"
|
||||||
|
|
||||||
|
if [ -n "${_seen_services[$service_name]:-}" ]; then
|
||||||
|
local original_source="${_service_sources[$service_name]}"
|
||||||
|
echo "ERROR: Duplicate service name '$service_name' detected —" >&2
|
||||||
|
echo " '$service_name' emitted twice — from $original_source and from $source" >&2
|
||||||
|
echo " Remove one of the conflicting activations to proceed." >&2
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
_seen_services[$service_name]=1
|
||||||
|
_service_sources[$service_name]="$source"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
# Helper: extract woodpecker_repo_id from a project TOML file
|
# Helper: extract woodpecker_repo_id from a project TOML file
|
||||||
# Returns empty string if not found or file doesn't exist
|
# Returns empty string if not found or file doesn't exist
|
||||||
_get_woodpecker_repo_id() {
|
_get_woodpecker_repo_id() {
|
||||||
|
|
@ -97,6 +119,16 @@ _generate_local_model_services() {
|
||||||
POLL_INTERVAL) poll_interval_val="$value" ;;
|
POLL_INTERVAL) poll_interval_val="$value" ;;
|
||||||
---)
|
---)
|
||||||
if [ -n "$service_name" ] && [ -n "$base_url" ]; then
|
if [ -n "$service_name" ] && [ -n "$base_url" ]; then
|
||||||
|
# Record service for duplicate detection using the full service name
|
||||||
|
local full_service_name="agents-${service_name}"
|
||||||
|
local toml_basename
|
||||||
|
toml_basename=$(basename "$toml")
|
||||||
|
if ! _record_service "$full_service_name" "[agents.$service_name] in projects/$toml_basename"; then
|
||||||
|
# Duplicate detected — clean up and abort
|
||||||
|
rm -f "$temp_file"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
# Per-agent FORGE_TOKEN / FORGE_PASS lookup (#834 Gap 3).
|
# Per-agent FORGE_TOKEN / FORGE_PASS lookup (#834 Gap 3).
|
||||||
# Two hired llama agents must not share the same Forgejo identity,
|
# Two hired llama agents must not share the same Forgejo identity,
|
||||||
# so we key the env-var lookup by forge_user (which hire-agent.sh
|
# so we key the env-var lookup by forge_user (which hire-agent.sh
|
||||||
|
|
@ -281,6 +313,21 @@ _generate_compose_impl() {
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Reset duplicate detection state for fresh run
|
||||||
|
_seen_services=()
|
||||||
|
_service_sources=()
|
||||||
|
|
||||||
|
# Initialize duplicate detection with base services defined in the template
|
||||||
|
_record_service "forgejo" "base compose template" || return 1
|
||||||
|
_record_service "woodpecker" "base compose template" || return 1
|
||||||
|
_record_service "woodpecker-agent" "base compose template" || return 1
|
||||||
|
_record_service "agents" "base compose template" || return 1
|
||||||
|
_record_service "runner" "base compose template" || return 1
|
||||||
|
_record_service "edge" "base compose template" || return 1
|
||||||
|
_record_service "staging" "base compose template" || return 1
|
||||||
|
_record_service "staging-deploy" "base compose template" || return 1
|
||||||
|
_record_service "chat" "base compose template" || return 1
|
||||||
|
|
||||||
# Extract primary woodpecker_repo_id from project TOML files
|
# Extract primary woodpecker_repo_id from project TOML files
|
||||||
local wp_repo_id
|
local wp_repo_id
|
||||||
wp_repo_id=$(_get_primary_woodpecker_repo_id)
|
wp_repo_id=$(_get_primary_woodpecker_repo_id)
|
||||||
|
|
@ -358,6 +405,9 @@ services:
|
||||||
WOODPECKER_SERVER: localhost:9000
|
WOODPECKER_SERVER: localhost:9000
|
||||||
WOODPECKER_AGENT_SECRET: ${WOODPECKER_AGENT_SECRET:-}
|
WOODPECKER_AGENT_SECRET: ${WOODPECKER_AGENT_SECRET:-}
|
||||||
WOODPECKER_GRPC_SECURE: "false"
|
WOODPECKER_GRPC_SECURE: "false"
|
||||||
|
WOODPECKER_GRPC_KEEPALIVE_TIME: "10s"
|
||||||
|
WOODPECKER_GRPC_KEEPALIVE_TIMEOUT: "20s"
|
||||||
|
WOODPECKER_GRPC_KEEPALIVE_PERMIT_WITHOUT_CALLS: "true"
|
||||||
WOODPECKER_HEALTHCHECK_ADDR: ":3333"
|
WOODPECKER_HEALTHCHECK_ADDR: ":3333"
|
||||||
WOODPECKER_BACKEND_DOCKER_NETWORK: ${WOODPECKER_CI_NETWORK:-disinto_disinto-net}
|
WOODPECKER_BACKEND_DOCKER_NETWORK: ${WOODPECKER_CI_NETWORK:-disinto_disinto-net}
|
||||||
WOODPECKER_MAX_WORKFLOWS: 1
|
WOODPECKER_MAX_WORKFLOWS: 1
|
||||||
|
|
@ -436,6 +486,76 @@ services:
|
||||||
|
|
||||||
COMPOSEEOF
|
COMPOSEEOF
|
||||||
|
|
||||||
|
# ── Conditional agents-llama block (ENABLE_LLAMA_AGENT=1) ──────────────
|
||||||
|
# This legacy flag was removed in #846 but kept for duplicate detection testing
|
||||||
|
if [ "${ENABLE_LLAMA_AGENT:-0}" = "1" ]; then
|
||||||
|
if ! _record_service "agents-llama" "ENABLE_LLAMA_AGENT=1"; then
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
cat >> "$compose_file" <<'COMPOSEEOF'
|
||||||
|
|
||||||
|
agents-llama:
|
||||||
|
image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}
|
||||||
|
container_name: disinto-agents-llama
|
||||||
|
restart: unless-stopped
|
||||||
|
security_opt:
|
||||||
|
- apparmor=unconfined
|
||||||
|
volumes:
|
||||||
|
- agent-data:/home/agent/data
|
||||||
|
- project-repos:/home/agent/repos
|
||||||
|
- ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
|
||||||
|
- ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
|
||||||
|
- ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
|
||||||
|
- woodpecker-data:/woodpecker-data:ro
|
||||||
|
- ./projects:/home/agent/disinto/projects:ro
|
||||||
|
- ./.env:/home/agent/disinto/.env:ro
|
||||||
|
- ./state:/home/agent/disinto/state
|
||||||
|
environment:
|
||||||
|
FORGE_URL: http://forgejo:3000
|
||||||
|
FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto}
|
||||||
|
FORGE_TOKEN: ${FORGE_TOKEN:-}
|
||||||
|
FORGE_REVIEW_TOKEN: ${FORGE_REVIEW_TOKEN:-}
|
||||||
|
FORGE_PLANNER_TOKEN: ${FORGE_PLANNER_TOKEN:-}
|
||||||
|
FORGE_GARDENER_TOKEN: ${FORGE_GARDENER_TOKEN:-}
|
||||||
|
FORGE_VAULT_TOKEN: ${FORGE_VAULT_TOKEN:-}
|
||||||
|
FORGE_SUPERVISOR_TOKEN: ${FORGE_SUPERVISOR_TOKEN:-}
|
||||||
|
FORGE_PREDICTOR_TOKEN: ${FORGE_PREDICTOR_TOKEN:-}
|
||||||
|
FORGE_ARCHITECT_TOKEN: ${FORGE_ARCHITECT_TOKEN:-}
|
||||||
|
FORGE_BOT_USERNAMES: ${FORGE_BOT_USERNAMES:-}
|
||||||
|
WOODPECKER_TOKEN: ${WOODPECKER_TOKEN:-}
|
||||||
|
CLAUDE_TIMEOUT: ${CLAUDE_TIMEOUT:-7200}
|
||||||
|
CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: ${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
|
||||||
|
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
|
||||||
|
FORGE_PASS: ${FORGE_PASS:-}
|
||||||
|
FORGE_ADMIN_PASS: ${FORGE_ADMIN_PASS:-}
|
||||||
|
FACTORY_REPO: ${FORGE_REPO:-disinto-admin/disinto}
|
||||||
|
DISINTO_CONTAINER: "1"
|
||||||
|
PROJECT_NAME: ${PROJECT_NAME:-project}
|
||||||
|
PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
|
||||||
|
WOODPECKER_DATA_DIR: /woodpecker-data
|
||||||
|
WOODPECKER_REPO_ID: "PLACEHOLDER_WP_REPO_ID"
|
||||||
|
CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
|
||||||
|
POLL_INTERVAL: ${POLL_INTERVAL:-300}
|
||||||
|
GARDENER_INTERVAL: ${GARDENER_INTERVAL:-21600}
|
||||||
|
ARCHITECT_INTERVAL: ${ARCHITECT_INTERVAL:-21600}
|
||||||
|
PLANNER_INTERVAL: ${PLANNER_INTERVAL:-43200}
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "pgrep", "-f", "entrypoint.sh"]
|
||||||
|
interval: 60s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
start_period: 30s
|
||||||
|
depends_on:
|
||||||
|
forgejo:
|
||||||
|
condition: service_healthy
|
||||||
|
woodpecker:
|
||||||
|
condition: service_started
|
||||||
|
networks:
|
||||||
|
- disinto-net
|
||||||
|
|
||||||
|
COMPOSEEOF
|
||||||
|
fi
|
||||||
|
|
||||||
# Resume the rest of the compose file (runner onward)
|
# Resume the rest of the compose file (runner onward)
|
||||||
cat >> "$compose_file" <<'COMPOSEEOF'
|
cat >> "$compose_file" <<'COMPOSEEOF'
|
||||||
|
|
||||||
|
|
@ -631,7 +751,10 @@ COMPOSEEOF
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Append local-model agent services if any are configured
|
# Append local-model agent services if any are configured
|
||||||
_generate_local_model_services "$compose_file"
|
if ! _generate_local_model_services "$compose_file"; then
|
||||||
|
echo "ERROR: Failed to generate local-model agent services. See errors above." >&2
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
# Resolve the Claude CLI binary path and persist as CLAUDE_BIN_DIR in .env.
|
# Resolve the Claude CLI binary path and persist as CLAUDE_BIN_DIR in .env.
|
||||||
# Only used by reproduce and edge services which still use host-mounted CLI.
|
# Only used by reproduce and edge services which still use host-mounted CLI.
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@
|
||||||
# Environment:
|
# Environment:
|
||||||
# REPO_ROOT — absolute path to repo root (defaults to parent of
|
# REPO_ROOT — absolute path to repo root (defaults to parent of
|
||||||
# this script's parent directory)
|
# this script's parent directory)
|
||||||
# JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 240)
|
# JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 360)
|
||||||
# JOB_READY_TIMEOUT_<JOBNAME> — per-job timeout override (e.g.,
|
# JOB_READY_TIMEOUT_<JOBNAME> — per-job timeout override (e.g.,
|
||||||
# JOB_READY_TIMEOUT_FORGEJO=300)
|
# JOB_READY_TIMEOUT_FORGEJO=300)
|
||||||
#
|
#
|
||||||
|
|
@ -33,7 +33,7 @@ set -euo pipefail
|
||||||
# ── Configuration ────────────────────────────────────────────────────────────
|
# ── Configuration ────────────────────────────────────────────────────────────
|
||||||
SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
|
REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
|
||||||
JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-240}"
|
JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-360}"
|
||||||
|
|
||||||
DRY_RUN=0
|
DRY_RUN=0
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -157,9 +157,10 @@ issue_claim() {
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
local ip_id bl_id
|
local ip_id bl_id bk_id
|
||||||
ip_id=$(_ilc_in_progress_id)
|
ip_id=$(_ilc_in_progress_id)
|
||||||
bl_id=$(_ilc_backlog_id)
|
bl_id=$(_ilc_backlog_id)
|
||||||
|
bk_id=$(_ilc_blocked_id)
|
||||||
if [ -n "$ip_id" ]; then
|
if [ -n "$ip_id" ]; then
|
||||||
curl -sf -X POST \
|
curl -sf -X POST \
|
||||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||||
|
|
@ -172,6 +173,12 @@ issue_claim() {
|
||||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||||
"${FORGE_API}/issues/${issue}/labels/${bl_id}" >/dev/null 2>&1 || true
|
"${FORGE_API}/issues/${issue}/labels/${bl_id}" >/dev/null 2>&1 || true
|
||||||
fi
|
fi
|
||||||
|
# Clear blocked label on re-claim — starting work is implicit resolution of prior block
|
||||||
|
if [ -n "$bk_id" ]; then
|
||||||
|
curl -sf -X DELETE \
|
||||||
|
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||||
|
"${FORGE_API}/issues/${issue}/labels/${bk_id}" >/dev/null 2>&1 || true
|
||||||
|
fi
|
||||||
_ilc_log "claimed issue #${issue}"
|
_ilc_log "claimed issue #${issue}"
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -429,19 +429,100 @@ pr_walk_to_merge() {
|
||||||
|
|
||||||
_prl_log "CI failed — invoking agent (attempt ${ci_fix_count}/${max_ci_fixes})"
|
_prl_log "CI failed — invoking agent (attempt ${ci_fix_count}/${max_ci_fixes})"
|
||||||
|
|
||||||
# Get CI logs from SQLite database if available
|
# Build per-workflow/per-step CI diagnostics prompt
|
||||||
|
local ci_prompt_body=""
|
||||||
|
local passing_workflows=""
|
||||||
|
local built_diagnostics=false
|
||||||
|
|
||||||
|
if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${WOODPECKER_REPO_ID:-}" ]; then
|
||||||
|
local pip_json
|
||||||
|
pip_json=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_PR_CI_PIPELINE}" 2>/dev/null) || pip_json=""
|
||||||
|
|
||||||
|
if [ -n "$pip_json" ]; then
|
||||||
|
local wf_count
|
||||||
|
wf_count=$(printf '%s' "$pip_json" | jq '[.workflows[]?] | length' 2>/dev/null) || wf_count=0
|
||||||
|
|
||||||
|
if [ "$wf_count" -gt 0 ]; then
|
||||||
|
built_diagnostics=true
|
||||||
|
local wf_idx=0
|
||||||
|
while [ "$wf_idx" -lt "$wf_count" ]; do
|
||||||
|
local wf_name wf_state
|
||||||
|
wf_name=$(printf '%s' "$pip_json" | jq -r ".workflows[$wf_idx].name // \"workflow-$wf_idx\"" 2>/dev/null)
|
||||||
|
wf_state=$(printf '%s' "$pip_json" | jq -r ".workflows[$wf_idx].state // \"unknown\"" 2>/dev/null)
|
||||||
|
|
||||||
|
if [ "$wf_state" = "failure" ] || [ "$wf_state" = "error" ] || [ "$wf_state" = "killed" ]; then
|
||||||
|
# Collect failed children for this workflow
|
||||||
|
local failed_children
|
||||||
|
failed_children=$(printf '%s' "$pip_json" | jq -r "
|
||||||
|
.workflows[$wf_idx].children[]? |
|
||||||
|
select(.state == \"failure\" or .state == \"error\" or .state == \"killed\") |
|
||||||
|
\"\(.name)\t\(.exit_code)\t\(.pid)\"" 2>/dev/null) || failed_children=""
|
||||||
|
|
||||||
|
ci_prompt_body="${ci_prompt_body}
|
||||||
|
--- Failed workflow: ${wf_name} ---"
|
||||||
|
if [ -n "$failed_children" ]; then
|
||||||
|
while IFS=$'\t' read -r step_name step_exit step_pid; do
|
||||||
|
[ -z "$step_name" ] && continue
|
||||||
|
local exit_annotation=""
|
||||||
|
case "$step_exit" in
|
||||||
|
126) exit_annotation=" (permission denied or not executable)" ;;
|
||||||
|
127) exit_annotation=" (command not found)" ;;
|
||||||
|
128) exit_annotation=" (invalid exit argument / signal+128)" ;;
|
||||||
|
esac
|
||||||
|
ci_prompt_body="${ci_prompt_body}
|
||||||
|
Step: ${step_name}
|
||||||
|
Exit code: ${step_exit}${exit_annotation}"
|
||||||
|
|
||||||
|
# Fetch per-step logs
|
||||||
|
if [ -n "$step_pid" ] && [ "$step_pid" != "null" ]; then
|
||||||
|
local step_logs
|
||||||
|
step_logs=$(ci_get_step_logs "$_PR_CI_PIPELINE" "$step_pid" 2>/dev/null | tail -50) || step_logs=""
|
||||||
|
if [ -n "$step_logs" ]; then
|
||||||
|
ci_prompt_body="${ci_prompt_body}
|
||||||
|
Log tail (last 50 lines):
|
||||||
|
\`\`\`
|
||||||
|
${step_logs}
|
||||||
|
\`\`\`"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
done <<< "$failed_children"
|
||||||
|
else
|
||||||
|
ci_prompt_body="${ci_prompt_body}
|
||||||
|
(no failed step details available)"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
# Track passing/other workflows
|
||||||
|
if [ -n "$passing_workflows" ]; then
|
||||||
|
passing_workflows="${passing_workflows}, ${wf_name}"
|
||||||
|
else
|
||||||
|
passing_workflows="${wf_name}"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
wf_idx=$((wf_idx + 1))
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Fallback: use legacy log fetch if per-workflow diagnostics unavailable
|
||||||
|
if [ "$built_diagnostics" = false ]; then
|
||||||
local ci_logs=""
|
local ci_logs=""
|
||||||
if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${FACTORY_ROOT:-}" ]; then
|
if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${FACTORY_ROOT:-}" ]; then
|
||||||
ci_logs=$(ci_get_logs "$_PR_CI_PIPELINE" 2>/dev/null | tail -50) || ci_logs=""
|
ci_logs=$(ci_get_logs "$_PR_CI_PIPELINE" 2>/dev/null | tail -50) || ci_logs=""
|
||||||
fi
|
fi
|
||||||
|
|
||||||
local logs_section=""
|
|
||||||
if [ -n "$ci_logs" ]; then
|
if [ -n "$ci_logs" ]; then
|
||||||
logs_section="
|
ci_prompt_body="
|
||||||
CI Log Output (last 50 lines):
|
CI Log Output (last 50 lines):
|
||||||
\`\`\`
|
\`\`\`
|
||||||
${ci_logs}
|
${ci_logs}
|
||||||
\`\`\`
|
\`\`\`"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
local passing_line=""
|
||||||
|
if [ -n "$passing_workflows" ]; then
|
||||||
|
passing_line="
|
||||||
|
Passing workflows (do not modify): ${passing_workflows}
|
||||||
"
|
"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
@ -450,9 +531,10 @@ ${ci_logs}
|
||||||
|
|
||||||
Pipeline: #${_PR_CI_PIPELINE:-?}
|
Pipeline: #${_PR_CI_PIPELINE:-?}
|
||||||
Failure type: ${_PR_CI_FAILURE_TYPE:-unknown}
|
Failure type: ${_PR_CI_FAILURE_TYPE:-unknown}
|
||||||
|
${passing_line}
|
||||||
Error log:
|
Error log:
|
||||||
${_PR_CI_ERROR_LOG:-No logs available.}${logs_section}
|
${_PR_CI_ERROR_LOG:-No logs available.}
|
||||||
|
${ci_prompt_body}
|
||||||
|
|
||||||
Fix the issue, run tests, commit, rebase on ${PRIMARY_BRANCH}, and push:
|
Fix the issue, run tests, commit, rebase on ${PRIMARY_BRANCH}, and push:
|
||||||
git fetch ${remote} ${PRIMARY_BRANCH} && git rebase ${remote}/${PRIMARY_BRANCH}
|
git fetch ${remote} ${PRIMARY_BRANCH} && git rebase ${remote}/${PRIMARY_BRANCH}
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
|
<!-- last-reviewed: 5ba18c8f80da6e3e574823e39e5aa760731c1705 -->
|
||||||
# nomad/ — Agent Instructions
|
# nomad/ — Agent Instructions
|
||||||
|
|
||||||
Nomad + Vault HCL for the factory's single-node cluster. These files are
|
Nomad + Vault HCL for the factory's single-node cluster. These files are
|
||||||
|
|
@ -21,7 +21,7 @@ see issues #821–#992 for the step breakdown.
|
||||||
| `jobs/agents.hcl` | submitted via `lib/init/nomad/deploy.sh` | All 7 agent roles (dev, review, gardener, planner, predictor, supervisor, architect) + llama variant; Vault-templated bot tokens via `service-agents` policy; `force_pull = false` — image is built locally by `bin/disinto --with agents`, no registry (S4.1, S4-fix-2, S4-fix-5, #955, #972, #978) |
|
| `jobs/agents.hcl` | submitted via `lib/init/nomad/deploy.sh` | All 7 agent roles (dev, review, gardener, planner, predictor, supervisor, architect) + llama variant; Vault-templated bot tokens via `service-agents` policy; `force_pull = false` — image is built locally by `bin/disinto --with agents`, no registry (S4.1, S4-fix-2, S4-fix-5, #955, #972, #978) |
|
||||||
| `jobs/staging.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy file-server mounting `docker/` as `/srv/site:ro`; no Vault integration; **dynamic host port** (no static 80 — edge owns 80/443, collision fixed in S5-fix-7 #1018); edge discovers via Nomad service registration (S5.2, #989) |
|
| `jobs/staging.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy file-server mounting `docker/` as `/srv/site:ro`; no Vault integration; **dynamic host port** (no static 80 — edge owns 80/443, collision fixed in S5-fix-7 #1018); edge discovers via Nomad service registration (S5.2, #989) |
|
||||||
| `jobs/chat.hcl` | submitted via `lib/init/nomad/deploy.sh` | Claude chat UI; custom `disinto/chat:local` image; sandbox hardening (cap_drop ALL, **tmpfs via mount block** not `tmpfs=` arg — S5-fix-5 #1012, pids_limit 128); Vault-templated OAuth secrets via `service-chat` policy (S5.2, #989) |
|
| `jobs/chat.hcl` | submitted via `lib/init/nomad/deploy.sh` | Claude chat UI; custom `disinto/chat:local` image; sandbox hardening (cap_drop ALL, **tmpfs via mount block** not `tmpfs=` arg — S5-fix-5 #1012, pids_limit 128); Vault-templated OAuth secrets via `service-chat` policy (S5.2, #989) |
|
||||||
| `jobs/edge.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy reverse proxy + dispatcher sidecar; routes /forge, /woodpecker, /staging, /chat; uses `disinto/edge:local` image built by `bin/disinto --with edge`; Vault-templated ops-repo creds via `service-dispatcher` policy (S5.1, #988) |
|
| `jobs/edge.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy reverse proxy + dispatcher sidecar; routes /forge, /woodpecker, /staging, /chat; uses `disinto/edge:local` image built by `bin/disinto --with edge`; **both Caddy and dispatcher tasks use `network_mode = "host"`** — upstreams are `127.0.0.1:<port>` (forgejo :3000, woodpecker :8000, chat :8080), not Docker hostnames (#1031, #1034); `FORGE_URL` rendered via Nomad service discovery template (not static env) to handle bridge vs. host network differences (#1034); dispatcher Vault secret path changed to `kv/data/disinto/shared/ops-repo` (#1041); Vault-templated ops-repo creds via `service-dispatcher` policy (S5.1, #988) |
|
||||||
|
|
||||||
Nomad auto-merges every `*.hcl` under `-config=/etc/nomad.d/`, so the
|
Nomad auto-merges every `*.hcl` under `-config=/etc/nomad.d/`, so the
|
||||||
split between `server.hcl` and `client.hcl` is for readability, not
|
split between `server.hcl` and `client.hcl` is for readability, not
|
||||||
|
|
|
||||||
|
|
@ -123,6 +123,19 @@ job "edge" {
|
||||||
# ── Caddyfile via Nomad service discovery (S5-fix-7, issue #1018) ────
|
# ── Caddyfile via Nomad service discovery (S5-fix-7, issue #1018) ────
|
||||||
# Renders staging upstream from Nomad service registration instead of
|
# Renders staging upstream from Nomad service registration instead of
|
||||||
# hardcoded staging:80. Caddy picks up /local/Caddyfile via entrypoint.
|
# hardcoded staging:80. Caddy picks up /local/Caddyfile via entrypoint.
|
||||||
|
# Forge URL via Nomad service discovery (issue #1034) — resolves forgejo
|
||||||
|
# service address/port dynamically for bridge network compatibility.
|
||||||
|
template {
|
||||||
|
destination = "local/forge.env"
|
||||||
|
env = true
|
||||||
|
change_mode = "restart"
|
||||||
|
data = <<EOT
|
||||||
|
{{ range service "forgejo" -}}
|
||||||
|
FORGE_URL=http://{{ .Address }}:{{ .Port }}
|
||||||
|
{{- end }}
|
||||||
|
EOT
|
||||||
|
}
|
||||||
|
|
||||||
template {
|
template {
|
||||||
destination = "local/Caddyfile"
|
destination = "local/Caddyfile"
|
||||||
change_mode = "restart"
|
change_mode = "restart"
|
||||||
|
|
@ -174,7 +187,6 @@ EOT
|
||||||
|
|
||||||
# ── Non-secret env ───────────────────────────────────────────────────
|
# ── Non-secret env ───────────────────────────────────────────────────
|
||||||
env {
|
env {
|
||||||
FORGE_URL = "http://127.0.0.1:3000"
|
|
||||||
FORGE_REPO = "disinto-admin/disinto"
|
FORGE_REPO = "disinto-admin/disinto"
|
||||||
DISINTO_CONTAINER = "1"
|
DISINTO_CONTAINER = "1"
|
||||||
PROJECT_NAME = "disinto"
|
PROJECT_NAME = "disinto"
|
||||||
|
|
@ -213,6 +225,21 @@ EOT
|
||||||
read_only = false
|
read_only = false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ── Forge URL via Nomad service discovery (issue #1034) ──────────
|
||||||
|
# Resolves forgejo service address/port dynamically for bridge network
|
||||||
|
# compatibility. Template-scoped to dispatcher task (Nomad doesn't
|
||||||
|
# propagate templates across tasks).
|
||||||
|
template {
|
||||||
|
destination = "local/forge.env"
|
||||||
|
env = true
|
||||||
|
change_mode = "restart"
|
||||||
|
data = <<EOT
|
||||||
|
{{ range service "forgejo" -}}
|
||||||
|
FORGE_URL=http://{{ .Address }}:{{ .Port }}
|
||||||
|
{{- end }}
|
||||||
|
EOT
|
||||||
|
}
|
||||||
|
|
||||||
# ── Vault-templated secrets (S5.1, issue #988) ──────────────────────
|
# ── Vault-templated secrets (S5.1, issue #988) ──────────────────────
|
||||||
# Renders FORGE_TOKEN from Vault KV v2 for ops repo access.
|
# Renders FORGE_TOKEN from Vault KV v2 for ops repo access.
|
||||||
template {
|
template {
|
||||||
|
|
@ -221,10 +248,10 @@ EOT
|
||||||
change_mode = "restart"
|
change_mode = "restart"
|
||||||
error_on_missing_key = false
|
error_on_missing_key = false
|
||||||
data = <<EOT
|
data = <<EOT
|
||||||
{{- with secret "kv/data/disinto/bots/vault" -}}
|
{{- with secret "kv/data/disinto/shared/ops-repo" -}}
|
||||||
FORGE_TOKEN={{ .Data.data.token }}
|
FORGE_TOKEN={{ .Data.data.token }}
|
||||||
{{- else -}}
|
{{- else -}}
|
||||||
# WARNING: kv/disinto/bots/vault is empty — run tools/vault-seed-agents.sh
|
# WARNING: kv/disinto/shared/ops-repo is empty — run tools/vault-seed-ops-repo.sh
|
||||||
FORGE_TOKEN=seed-me
|
FORGE_TOKEN=seed-me
|
||||||
{{- end }}
|
{{- end }}
|
||||||
EOT
|
EOT
|
||||||
|
|
@ -233,7 +260,6 @@ EOT
|
||||||
# ── Non-secret env ───────────────────────────────────────────────────
|
# ── Non-secret env ───────────────────────────────────────────────────
|
||||||
env {
|
env {
|
||||||
DISPATCHER_BACKEND = "nomad"
|
DISPATCHER_BACKEND = "nomad"
|
||||||
FORGE_URL = "http://127.0.0.1:3000"
|
|
||||||
FORGE_REPO = "disinto-admin/disinto"
|
FORGE_REPO = "disinto-admin/disinto"
|
||||||
FORGE_OPS_REPO = "disinto-admin/disinto-ops"
|
FORGE_OPS_REPO = "disinto-admin/disinto-ops"
|
||||||
PRIMARY_BRANCH = "main"
|
PRIMARY_BRANCH = "main"
|
||||||
|
|
|
||||||
|
|
@ -57,7 +57,7 @@ job "woodpecker-agent" {
|
||||||
check {
|
check {
|
||||||
type = "http"
|
type = "http"
|
||||||
path = "/healthz"
|
path = "/healthz"
|
||||||
interval = "15s"
|
interval = "10s"
|
||||||
timeout = "3s"
|
timeout = "3s"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -91,6 +91,9 @@ job "woodpecker-agent" {
|
||||||
env {
|
env {
|
||||||
WOODPECKER_SERVER = "${attr.unique.network.ip-address}:9000"
|
WOODPECKER_SERVER = "${attr.unique.network.ip-address}:9000"
|
||||||
WOODPECKER_GRPC_SECURE = "false"
|
WOODPECKER_GRPC_SECURE = "false"
|
||||||
|
WOODPECKER_GRPC_KEEPALIVE_TIME = "10s"
|
||||||
|
WOODPECKER_GRPC_KEEPALIVE_TIMEOUT = "20s"
|
||||||
|
WOODPECKER_GRPC_KEEPALIVE_PERMIT_WITHOUT_CALLS = "true"
|
||||||
WOODPECKER_MAX_WORKFLOWS = "1"
|
WOODPECKER_MAX_WORKFLOWS = "1"
|
||||||
WOODPECKER_HEALTHCHECK_ADDR = ":3333"
|
WOODPECKER_HEALTHCHECK_ADDR = ":3333"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
|
<!-- last-reviewed: 5ba18c8f80da6e3e574823e39e5aa760731c1705 -->
|
||||||
# Planner Agent
|
# Planner Agent
|
||||||
|
|
||||||
**Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints),
|
**Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints),
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
|
<!-- last-reviewed: 5ba18c8f80da6e3e574823e39e5aa760731c1705 -->
|
||||||
# Predictor Agent
|
# Predictor Agent
|
||||||
|
|
||||||
**Role**: Abstract adversary (the "goblin"). Runs a 2-step formula
|
**Role**: Abstract adversary (the "goblin"). Runs a 2-step formula
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
|
<!-- last-reviewed: 5ba18c8f80da6e3e574823e39e5aa760731c1705 -->
|
||||||
# Review Agent
|
# Review Agent
|
||||||
|
|
||||||
**Role**: AI-powered PR review — post structured findings and formal
|
**Role**: AI-powered PR review — post structured findings and formal
|
||||||
|
|
|
||||||
|
|
@ -52,8 +52,35 @@ REVIEW_TMPDIR=$(mktemp -d)
|
||||||
|
|
||||||
log() { printf '[%s] PR#%s %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" >> "$LOGFILE"; }
|
log() { printf '[%s] PR#%s %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" >> "$LOGFILE"; }
|
||||||
status() { printf '[%s] PR #%s: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" > "$STATUSFILE"; log "$*"; }
|
status() { printf '[%s] PR #%s: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" > "$STATUSFILE"; log "$*"; }
|
||||||
cleanup() { rm -rf "$REVIEW_TMPDIR" "$LOCKFILE" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json"; }
|
|
||||||
trap cleanup EXIT
|
# cleanup — remove temp files (NOT lockfile — cleanup_on_exit handles that)
|
||||||
|
cleanup() {
|
||||||
|
rm -rf "$REVIEW_TMPDIR" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json"
|
||||||
|
}
|
||||||
|
|
||||||
|
# cleanup_on_exit — defensive cleanup: remove lockfile if we own it, kill residual children
|
||||||
|
# This handles the case where review-pr.sh is terminated unexpectedly (e.g., watchdog SIGTERM)
|
||||||
|
cleanup_on_exit() {
|
||||||
|
local ec=$?
|
||||||
|
# Remove lockfile only if we own it (PID matches $$)
|
||||||
|
if [ -f "$LOCKFILE" ] && [ -n "$(cat "$LOCKFILE" 2>/dev/null)" ]; then
|
||||||
|
if [ "$(cat "$LOCKFILE" 2>/dev/null)" = "$$" ]; then
|
||||||
|
rm -f "$LOCKFILE"
|
||||||
|
log "cleanup_on_exit: removed lockfile (we owned it)"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
# Kill any direct children that may have been spawned by this process
|
||||||
|
# (e.g., bash -c commands from Claude's Bash tool that didn't get reaped)
|
||||||
|
pkill -P $$ 2>/dev/null || true
|
||||||
|
# Call the main cleanup function to remove temp files
|
||||||
|
cleanup
|
||||||
|
exit "$ec"
|
||||||
|
}
|
||||||
|
trap cleanup_on_exit EXIT INT TERM
|
||||||
|
|
||||||
|
# Note: EXIT trap is already set above. The cleanup function is still available for
|
||||||
|
# non-error exits (e.g., normal completion via exit 0 after verdict posted).
|
||||||
|
# When review succeeds, we want to skip lockfile removal since the verdict was posted.
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# LOG ROTATION
|
# LOG ROTATION
|
||||||
|
|
@ -104,6 +131,7 @@ if [ "$PR_STATE" != "open" ]; then
|
||||||
log "SKIP: state=${PR_STATE}"
|
log "SKIP: state=${PR_STATE}"
|
||||||
worktree_cleanup "$WORKTREE"
|
worktree_cleanup "$WORKTREE"
|
||||||
rm -f "$OUTPUT_FILE" "$SID_FILE" 2>/dev/null || true
|
rm -f "$OUTPUT_FILE" "$SID_FILE" 2>/dev/null || true
|
||||||
|
rm -f "$LOCKFILE"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
@ -113,7 +141,7 @@ fi
|
||||||
CI_STATE=$(ci_commit_status "$PR_SHA")
|
CI_STATE=$(ci_commit_status "$PR_SHA")
|
||||||
CI_NOTE=""
|
CI_NOTE=""
|
||||||
if ! ci_passed "$CI_STATE"; then
|
if ! ci_passed "$CI_STATE"; then
|
||||||
ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; exit 0; }
|
ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; rm -f "$LOCKFILE"; exit 0; }
|
||||||
CI_NOTE=" (not required — non-code PR)"
|
CI_NOTE=" (not required — non-code PR)"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
@ -123,10 +151,10 @@ fi
|
||||||
ALL_COMMENTS=$(forge_api_all "/issues/${PR_NUMBER}/comments")
|
ALL_COMMENTS=$(forge_api_all "/issues/${PR_NUMBER}/comments")
|
||||||
HAS_CMT=$(printf '%s' "$ALL_COMMENTS" | jq --arg s "$PR_SHA" \
|
HAS_CMT=$(printf '%s' "$ALL_COMMENTS" | jq --arg s "$PR_SHA" \
|
||||||
'[.[]|select(.body|contains("<!-- reviewed: "+$s+" -->"))]|length')
|
'[.[]|select(.body|contains("<!-- reviewed: "+$s+" -->"))]|length')
|
||||||
[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; exit 0; }
|
[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; rm -f "$LOCKFILE"; exit 0; }
|
||||||
HAS_FML=$(forge_api_all "/pulls/${PR_NUMBER}/reviews" | jq --arg s "$PR_SHA" \
|
HAS_FML=$(forge_api_all "/pulls/${PR_NUMBER}/reviews" | jq --arg s "$PR_SHA" \
|
||||||
'[.[]|select(.commit_id==$s)|select(.state!="COMMENT")]|length')
|
'[.[]|select(.commit_id==$s)|select(.state!="COMMENT")]|length')
|
||||||
[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; exit 0; }
|
[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; rm -f "$LOCKFILE"; exit 0; }
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# RE-REVIEW DETECTION
|
# RE-REVIEW DETECTION
|
||||||
|
|
@ -324,3 +352,7 @@ esac
|
||||||
profile_write_journal "review-${PR_NUMBER}" "Review PR #${PR_NUMBER} (${VERDICT})" "${VERDICT,,}" "" || true
|
profile_write_journal "review-${PR_NUMBER}" "Review PR #${PR_NUMBER} (${VERDICT})" "${VERDICT,,}" "" || true
|
||||||
|
|
||||||
log "DONE: ${VERDICT} (re-review: ${IS_RE_REVIEW})"
|
log "DONE: ${VERDICT} (re-review: ${IS_RE_REVIEW})"
|
||||||
|
|
||||||
|
# Remove lockfile on successful completion (cleanup_on_exit will also do this,
|
||||||
|
# but we do it here to avoid the trap running twice)
|
||||||
|
rm -f "$LOCKFILE"
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
|
<!-- last-reviewed: 5ba18c8f80da6e3e574823e39e5aa760731c1705 -->
|
||||||
# Supervisor Agent
|
# Supervisor Agent
|
||||||
|
|
||||||
**Role**: Health monitoring and auto-remediation, executed as a formula-driven
|
**Role**: Health monitoring and auto-remediation, executed as a formula-driven
|
||||||
|
|
|
||||||
|
|
@ -426,3 +426,19 @@ setup_file() {
|
||||||
[[ "$output" == *"services to deploy: forgejo,woodpecker-server,woodpecker-agent,agents"* ]]
|
[[ "$output" == *"services to deploy: forgejo,woodpecker-server,woodpecker-agent,agents"* ]]
|
||||||
[[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent agents"* ]]
|
[[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent agents"* ]]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# S5.1 / #1035 — edge service seeds ops-repo (dispatcher FORGE_TOKEN)
|
||||||
|
@test "disinto init --backend=nomad --with edge deploys edge" {
|
||||||
|
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with edge --dry-run
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
# edge depends on all backend services, so all are included
|
||||||
|
[[ "$output" == *"services to deploy: edge,forgejo"* ]]
|
||||||
|
[[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent agents staging chat edge"* ]]
|
||||||
|
[[ "$output" == *"[deploy] [dry-run] nomad job validate"*"edge.hcl"* ]]
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "disinto init --backend=nomad --with edge seeds ops-repo" {
|
||||||
|
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with edge --dry-run
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
[[ "$output" == *"tools/vault-seed-ops-repo.sh --dry-run"* ]]
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
FACTORY_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
FACTORY_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
||||||
|
export FACTORY_ROOT_REAL="$FACTORY_ROOT"
|
||||||
# Always use localhost for mock Forgejo (in case FORGE_URL is set from docker-compose)
|
# Always use localhost for mock Forgejo (in case FORGE_URL is set from docker-compose)
|
||||||
export FORGE_URL="http://localhost:3000"
|
export FORGE_URL="http://localhost:3000"
|
||||||
MOCK_BIN="/tmp/smoke-mock-bin"
|
MOCK_BIN="/tmp/smoke-mock-bin"
|
||||||
|
|
@ -30,7 +31,8 @@ cleanup() {
|
||||||
rm -rf "$MOCK_BIN" /tmp/smoke-test-repo \
|
rm -rf "$MOCK_BIN" /tmp/smoke-test-repo \
|
||||||
"${FACTORY_ROOT}/projects/smoke-repo.toml" \
|
"${FACTORY_ROOT}/projects/smoke-repo.toml" \
|
||||||
/tmp/smoke-claude-shared /tmp/smoke-home-claude \
|
/tmp/smoke-claude-shared /tmp/smoke-home-claude \
|
||||||
/tmp/smoke-env-before-rerun /tmp/smoke-env-before-dryrun
|
/tmp/smoke-env-before-rerun /tmp/smoke-env-before-dryrun \
|
||||||
|
"${FACTORY_ROOT}/docker-compose.yml"
|
||||||
# Restore .env only if we created the backup
|
# Restore .env only if we created the backup
|
||||||
if [ -f "${FACTORY_ROOT}/.env.smoke-backup" ]; then
|
if [ -f "${FACTORY_ROOT}/.env.smoke-backup" ]; then
|
||||||
mv "${FACTORY_ROOT}/.env.smoke-backup" "${FACTORY_ROOT}/.env"
|
mv "${FACTORY_ROOT}/.env.smoke-backup" "${FACTORY_ROOT}/.env"
|
||||||
|
|
@ -423,6 +425,51 @@ export CLAUDE_SHARED_DIR="$ORIG_CLAUDE_SHARED_DIR"
|
||||||
export CLAUDE_CONFIG_DIR="$ORIG_CLAUDE_CONFIG_DIR"
|
export CLAUDE_CONFIG_DIR="$ORIG_CLAUDE_CONFIG_DIR"
|
||||||
rm -rf /tmp/smoke-claude-shared /tmp/smoke-home-claude
|
rm -rf /tmp/smoke-claude-shared /tmp/smoke-home-claude
|
||||||
|
|
||||||
|
# ── 8. Test duplicate service name detection ──────────────────────────────
|
||||||
|
echo "=== 8/8 Testing duplicate service name detection ==="
|
||||||
|
|
||||||
|
# Isolated factory root — do NOT touch the real ${FACTORY_ROOT}/projects/
|
||||||
|
SMOKE_DUP_ROOT=$(mktemp -d)
|
||||||
|
mkdir -p "$SMOKE_DUP_ROOT/projects"
|
||||||
|
cat > "$SMOKE_DUP_ROOT/projects/duplicate-test.toml" <<'TOMLEOF'
|
||||||
|
name = "duplicate-test"
|
||||||
|
description = "dup-detection smoke"
|
||||||
|
|
||||||
|
[ci]
|
||||||
|
woodpecker_repo_id = "999"
|
||||||
|
|
||||||
|
[agents.llama]
|
||||||
|
base_url = "http://localhost:8080"
|
||||||
|
model = "qwen:latest"
|
||||||
|
roles = ["dev"]
|
||||||
|
forge_user = "llama-bot"
|
||||||
|
TOMLEOF
|
||||||
|
|
||||||
|
# Call the generator directly — no `disinto init` to overwrite the TOML.
|
||||||
|
# FACTORY_ROOT tells generators.sh where projects/ + compose_file live.
|
||||||
|
(
|
||||||
|
export FACTORY_ROOT="$SMOKE_DUP_ROOT"
|
||||||
|
export ENABLE_LLAMA_AGENT=1
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "${FACTORY_ROOT_REAL:-$(cd "$(dirname "$0")/.." && pwd)}/lib/generators.sh"
|
||||||
|
# Use a temp file to capture output since pipefail will kill the pipeline
|
||||||
|
# when _generate_compose_impl returns non-zero
|
||||||
|
_generate_compose_impl > /tmp/smoke-dup-output.txt 2>&1 || true
|
||||||
|
if grep -q "Duplicate service name" /tmp/smoke-dup-output.txt; then
|
||||||
|
pass "Duplicate service detection: conflict between ENABLE_LLAMA_AGENT and [agents.llama] reported"
|
||||||
|
rm -f /tmp/smoke-dup-output.txt
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
fail "Duplicate service detection: no error raised for ENABLE_LLAMA_AGENT + [agents.llama]"
|
||||||
|
cat /tmp/smoke-dup-output.txt >&2
|
||||||
|
rm -f /tmp/smoke-dup-output.txt
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
) || FAILED=1
|
||||||
|
|
||||||
|
rm -rf "$SMOKE_DUP_ROOT"
|
||||||
|
unset ENABLE_LLAMA_AGENT
|
||||||
|
|
||||||
# ── Summary ──────────────────────────────────────────────────────────────────
|
# ── Summary ──────────────────────────────────────────────────────────────────
|
||||||
echo ""
|
echo ""
|
||||||
if [ "$FAILED" -ne 0 ]; then
|
if [ "$FAILED" -ne 0 ]; then
|
||||||
|
|
|
||||||
210
tests/test-duplicate-service-detection.sh
Executable file
210
tests/test-duplicate-service-detection.sh
Executable file
|
|
@ -0,0 +1,210 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# tests/test-duplicate-service-detection.sh — Unit test for duplicate service detection
|
||||||
|
#
|
||||||
|
# Tests that the compose generator correctly detects duplicate service names
|
||||||
|
# between ENABLE_LLAMA_AGENT=1 and [agents.llama] TOML configuration.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Get the absolute path to the disinto root
|
||||||
|
DISINTO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||||
|
TEST_DIR=$(mktemp -d)
|
||||||
|
trap "rm -rf \"\$TEST_DIR\"" EXIT
|
||||||
|
|
||||||
|
FAILED=0
|
||||||
|
|
||||||
|
fail() { printf 'FAIL: %s\n' "$*" >&2; FAILED=1; }
|
||||||
|
pass() { printf 'PASS: %s\n' "$*"; }
|
||||||
|
|
||||||
|
# Test 1: Duplicate between ENABLE_LLAMA_AGENT and [agents.llama]
|
||||||
|
echo "=== Test 1: Duplicate between ENABLE_LLAMA_AGENT and [agents.llama] ==="
|
||||||
|
|
||||||
|
# Create projects directory and test project TOML with an agent named "llama"
|
||||||
|
mkdir -p "${TEST_DIR}/projects"
|
||||||
|
cat > "${TEST_DIR}/projects/test-project.toml" <<'TOMLEOF'
|
||||||
|
name = "test-project"
|
||||||
|
description = "Test project for duplicate detection"
|
||||||
|
|
||||||
|
[ci]
|
||||||
|
woodpecker_repo_id = "123"
|
||||||
|
|
||||||
|
[agents.llama]
|
||||||
|
base_url = "http://localhost:8080"
|
||||||
|
model = "qwen:latest"
|
||||||
|
roles = ["dev"]
|
||||||
|
forge_user = "llama-bot"
|
||||||
|
TOMLEOF
|
||||||
|
|
||||||
|
# Create a minimal compose file
|
||||||
|
cat > "${TEST_DIR}/docker-compose.yml" <<'COMPOSEEOF'
|
||||||
|
# Test compose file
|
||||||
|
services:
|
||||||
|
agents:
|
||||||
|
image: test:latest
|
||||||
|
command: echo "hello"
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
test-data:
|
||||||
|
|
||||||
|
networks:
|
||||||
|
test-net:
|
||||||
|
COMPOSEEOF
|
||||||
|
|
||||||
|
# Set up the test environment
|
||||||
|
export FACTORY_ROOT="${TEST_DIR}"
|
||||||
|
export PROJECT_NAME="test-project"
|
||||||
|
export ENABLE_LLAMA_AGENT="1"
|
||||||
|
export FORGE_TOKEN=""
|
||||||
|
export FORGE_PASS=""
|
||||||
|
export CLAUDE_TIMEOUT="7200"
|
||||||
|
export POLL_INTERVAL="300"
|
||||||
|
export GARDENER_INTERVAL="21600"
|
||||||
|
export ARCHITECT_INTERVAL="21600"
|
||||||
|
export PLANNER_INTERVAL="43200"
|
||||||
|
export SUPERVISOR_INTERVAL="1200"
|
||||||
|
|
||||||
|
# Source the generators module and run the compose generator directly
|
||||||
|
source "${DISINTO_ROOT}/lib/generators.sh"
|
||||||
|
|
||||||
|
# Delete the compose file to force regeneration
|
||||||
|
rm -f "${TEST_DIR}/docker-compose.yml"
|
||||||
|
|
||||||
|
# Run the compose generator directly
|
||||||
|
if _generate_compose_impl 3000 false 2>&1 | tee "${TEST_DIR}/output.txt"; then
|
||||||
|
# Check if the output contains the duplicate error message
|
||||||
|
if grep -q "Duplicate service name 'agents-llama'" "${TEST_DIR}/output.txt"; then
|
||||||
|
pass "Duplicate detection: correctly detected conflict between ENABLE_LLAMA_AGENT and [agents.llama]"
|
||||||
|
else
|
||||||
|
fail "Duplicate detection: should have detected conflict between ENABLE_LLAMA_AGENT and [agents.llama]"
|
||||||
|
cat "${TEST_DIR}/output.txt" >&2
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
# Generator should fail with non-zero exit code
|
||||||
|
if grep -q "Duplicate service name 'agents-llama'" "${TEST_DIR}/output.txt"; then
|
||||||
|
pass "Duplicate detection: correctly detected conflict and returned non-zero exit code"
|
||||||
|
else
|
||||||
|
fail "Duplicate detection: should have failed with duplicate error"
|
||||||
|
cat "${TEST_DIR}/output.txt" >&2
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 2: No duplicate when only ENABLE_LLAMA_AGENT is set (no conflicting TOML)
|
||||||
|
echo ""
|
||||||
|
echo "=== Test 2: No duplicate when only ENABLE_LLAMA_AGENT is set ==="
|
||||||
|
|
||||||
|
# Remove the projects directory created in Test 1
|
||||||
|
rm -rf "${TEST_DIR}/projects"
|
||||||
|
|
||||||
|
# Create a fresh compose file
|
||||||
|
cat > "${TEST_DIR}/docker-compose.yml" <<'COMPOSEEOF'
|
||||||
|
# Test compose file
|
||||||
|
services:
|
||||||
|
agents:
|
||||||
|
image: test:latest
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
test-data:
|
||||||
|
|
||||||
|
networks:
|
||||||
|
test-net:
|
||||||
|
COMPOSEEOF
|
||||||
|
|
||||||
|
# Set ENABLE_LLAMA_AGENT
|
||||||
|
export ENABLE_LLAMA_AGENT="1"
|
||||||
|
|
||||||
|
# Delete the compose file to force regeneration
|
||||||
|
rm -f "${TEST_DIR}/docker-compose.yml"
|
||||||
|
|
||||||
|
if _generate_compose_impl 3000 false 2>&1 | tee "${TEST_DIR}/output2.txt"; then
|
||||||
|
if grep -q "Duplicate" "${TEST_DIR}/output2.txt"; then
|
||||||
|
fail "No duplicate: should not detect duplicate when only ENABLE_LLAMA_AGENT is set"
|
||||||
|
else
|
||||||
|
pass "No duplicate: correctly generated compose without duplicates"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
# Non-zero exit is fine if there's a legitimate reason (e.g., missing files)
|
||||||
|
if grep -q "Duplicate" "${TEST_DIR}/output2.txt"; then
|
||||||
|
fail "No duplicate: should not detect duplicate when only ENABLE_LLAMA_AGENT is set"
|
||||||
|
else
|
||||||
|
pass "No duplicate: generator failed for other reason (acceptable)"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Test 3: Duplicate between two TOML agents with same name
|
||||||
|
echo ""
|
||||||
|
echo "=== Test 3: Duplicate between two TOML agents with same name ==="
|
||||||
|
|
||||||
|
rm -f "${TEST_DIR}/docker-compose.yml"
|
||||||
|
|
||||||
|
# Create projects directory for Test 3
|
||||||
|
mkdir -p "${TEST_DIR}/projects"
|
||||||
|
|
||||||
|
cat > "${TEST_DIR}/projects/project1.toml" <<'TOMLEOF'
|
||||||
|
name = "project1"
|
||||||
|
description = "First project"
|
||||||
|
|
||||||
|
[ci]
|
||||||
|
woodpecker_repo_id = "1"
|
||||||
|
|
||||||
|
[agents.llama]
|
||||||
|
base_url = "http://localhost:8080"
|
||||||
|
model = "qwen:latest"
|
||||||
|
roles = ["dev"]
|
||||||
|
forge_user = "llama-bot1"
|
||||||
|
TOMLEOF
|
||||||
|
|
||||||
|
cat > "${TEST_DIR}/projects/project2.toml" <<'TOMLEOF'
|
||||||
|
name = "project2"
|
||||||
|
description = "Second project"
|
||||||
|
|
||||||
|
[ci]
|
||||||
|
woodpecker_repo_id = "2"
|
||||||
|
|
||||||
|
[agents.llama]
|
||||||
|
base_url = "http://localhost:8080"
|
||||||
|
model = "qwen:latest"
|
||||||
|
roles = ["dev"]
|
||||||
|
forge_user = "llama-bot2"
|
||||||
|
TOMLEOF
|
||||||
|
|
||||||
|
cat > "${TEST_DIR}/docker-compose.yml" <<'COMPOSEEOF'
|
||||||
|
# Test compose file
|
||||||
|
services:
|
||||||
|
agents:
|
||||||
|
image: test:latest
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
test-data:
|
||||||
|
|
||||||
|
networks:
|
||||||
|
test-net:
|
||||||
|
COMPOSEEOF
|
||||||
|
|
||||||
|
unset ENABLE_LLAMA_AGENT
|
||||||
|
|
||||||
|
# Delete the compose file to force regeneration
|
||||||
|
rm -f "${TEST_DIR}/docker-compose.yml"
|
||||||
|
|
||||||
|
if _generate_compose_impl 3000 false 2>&1 | tee "${TEST_DIR}/output3.txt"; then
|
||||||
|
if grep -q "Duplicate service name 'agents-llama'" "${TEST_DIR}/output3.txt"; then
|
||||||
|
pass "Duplicate detection: correctly detected conflict between two [agents.llama] blocks"
|
||||||
|
else
|
||||||
|
fail "Duplicate detection: should have detected conflict between two [agents.llama] blocks"
|
||||||
|
cat "${TEST_DIR}/output3.txt" >&2
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
if grep -q "Duplicate service name 'agents-llama'" "${TEST_DIR}/output3.txt"; then
|
||||||
|
pass "Duplicate detection: correctly detected conflict and returned non-zero exit code"
|
||||||
|
else
|
||||||
|
fail "Duplicate detection: should have failed with duplicate error"
|
||||||
|
cat "${TEST_DIR}/output3.txt" >&2
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Summary
|
||||||
|
echo ""
|
||||||
|
if [ "$FAILED" -ne 0 ]; then
|
||||||
|
echo "=== TESTS FAILED ==="
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "=== ALL TESTS PASSED ==="
|
||||||
129
tests/test-watchdog-process-group.sh
Executable file
129
tests/test-watchdog-process-group.sh
Executable file
|
|
@ -0,0 +1,129 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# test-watchdog-process-group.sh — Test that claude_run_with_watchdog kills orphan children
|
||||||
|
#
|
||||||
|
# This test verifies that when claude_run_with_watchdog terminates the Claude process,
|
||||||
|
# all child processes (including those spawned by Claude's Bash tool) are also killed.
|
||||||
|
#
|
||||||
|
# Reproducer scenario:
|
||||||
|
# 1. Create a fake "claude" stub that:
|
||||||
|
# a. Spawns a long-running child process (sleep 3600)
|
||||||
|
# b. Writes a result marker to stdout to trigger idle detection
|
||||||
|
# c. Stays running
|
||||||
|
# 2. Run claude_run_with_watchdog with the stub
|
||||||
|
# 3. Before the fix: sleep child survives (orphaned to PID 1)
|
||||||
|
# 4. After the fix: sleep child dies (killed as part of process group with -PID)
|
||||||
|
#
|
||||||
|
# Usage: ./tests/test-watchdog-process-group.sh
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
|
||||||
|
TEST_TMP="/tmp/test-watchdog-$$"
|
||||||
|
LOGFILE="${TEST_TMP}/log.txt"
|
||||||
|
PASS=true
|
||||||
|
|
||||||
|
# shellcheck disable=SC2317
|
||||||
|
cleanup_test() {
|
||||||
|
rm -rf "$TEST_TMP"
|
||||||
|
}
|
||||||
|
trap cleanup_test EXIT INT TERM
|
||||||
|
|
||||||
|
mkdir -p "$TEST_TMP"
|
||||||
|
|
||||||
|
log() {
|
||||||
|
printf '[TEST] %s\n' "$*" | tee -a "$LOGFILE"
|
||||||
|
}
|
||||||
|
|
||||||
|
fail() {
|
||||||
|
printf '[TEST] FAIL: %s\n' "$*" | tee -a "$LOGFILE"
|
||||||
|
PASS=false
|
||||||
|
}
|
||||||
|
|
||||||
|
pass() {
|
||||||
|
printf '[TEST] PASS: %s\n' "$*" | tee -a "$LOGFILE"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Export required environment variables (read by the watchdog in agent-sdk.sh).
export CLAUDE_TIMEOUT=10 # Short timeout for testing
export CLAUDE_IDLE_GRACE=2 # Short grace period for testing
export LOGFILE="${LOGFILE}" # Required by agent-sdk.sh

# Create a fake claude stub that:
# 1. Spawns a long-running child process (sleep 3600) that becomes an orphan
#    if only the parent — and not its process group — is killed
# 2. Writes a result marker to stdout (to trigger the watchdog's
#    idle-after-result path)
# 3. Stays running so the watchdog has something to kill
# NOTE: the heredoc delimiter is quoted ('FAKE_CLAUDE_EOF'), so nothing inside
# is expanded here — the stub evaluates $!, $CHILD_PID, etc. at its own runtime.
cat > "${TEST_TMP}/fake-claude" << 'FAKE_CLAUDE_EOF'
#!/usr/bin/env bash
# Fake claude that spawns a child and stays running
# Simulates Claude's behavior when it spawns a Bash tool command

# Write result marker to stdout (triggers watchdog idle detection)
echo '{"type":"result","session_id":"test-session-123","verdict":"APPROVE"}'

# Spawn a child that simulates Claude's Bash tool hanging
# This is the process that should be killed when the parent is terminated
sleep 3600 &
CHILD_PID=$!

# Log the child PID for debugging
echo "FAKE_CLAUDE_CHILD_PID=$CHILD_PID" >&2

# Stay running - sleep in a loop so the watchdog can kill us
while true; do
    sleep 3600 &
    wait $! 2>/dev/null || true
done
FAKE_CLAUDE_EOF
chmod +x "${TEST_TMP}/fake-claude"
|
||||||
|
|
||||||
|
log "Testing claude_run_with_watchdog process group cleanup..."

# Source the library and run claude_run_with_watchdog.
# NOTE(review): this outer `source` looks redundant — the child bash below
# re-sources the library itself and nothing from it is called in this shell;
# confirm before removing.
cd "$SCRIPT_DIR"
source lib/agent-sdk.sh

log "Starting claude_run_with_watchdog with fake claude..."

# Run the function directly (not as a script); capture stdout+stderr to a file.
# Outer `timeout 35` is a backstop only — CLAUDE_TIMEOUT=10 plus
# CLAUDE_IDLE_GRACE=2 should finish the run well before it fires.
# `|| true`: a non-zero exit (watchdog kill / timeout) is the expected
# outcome here and must not abort this script under `set -e`.
OUTPUT_FILE="${TEST_TMP}/output.txt"
timeout 35 bash -c "
source '${SCRIPT_DIR}/lib/agent-sdk.sh'
CLAUDE_TIMEOUT=10 CLAUDE_IDLE_GRACE=2 LOGFILE='${LOGFILE}' claude_run_with_watchdog '${TEST_TMP}/fake-claude' > '${OUTPUT_FILE}' 2>&1
exit \$?
" || true

# Give the watchdog a moment to finish killing the process group.
log "Waiting for cleanup..."
sleep 5
|
||||||
|
|
||||||
|
# More precise check: look for "sleep 3600" processes — the orphans our fake
# claude would leave behind if process-group cleanup failed.
#
# BUG FIX: `grep -c` prints "0" AND exits 1 when nothing matches, so the old
# `|| echo "0"` fallback produced "0\n0" here, making the `-gt` comparison
# below fail with "integer expression expected" (and silently fall into the
# else branch). Guard the pipeline's exit status with `|| true` instead, and
# default to 0 only if the pipeline emitted nothing at all.
ORPHAN_COUNT=$(pgrep -a sleep 2>/dev/null | grep -c "sleep 3600" 2>/dev/null || true)
ORPHAN_COUNT=${ORPHAN_COUNT:-0}

if [ "$ORPHAN_COUNT" -gt 0 ]; then
    log "Found $ORPHAN_COUNT orphan sleep 3600 processes:"
    # Listing can race with process exit; don't let a grep miss abort the
    # script under `set -e`.
    pgrep -a sleep | grep "sleep 3600" || true
    fail "Orphan children found - process group cleanup did not work"
else
    pass "No orphan children found - process group cleanup worked"
fi
|
||||||
|
|
||||||
|
# Also verify that the fake claude itself is not running.
#
# BUG FIX: `pgrep -c` prints "0" AND exits 1 when no process matches, so the
# old `|| echo "0"` fallback captured "0\n0" and broke the integer comparison
# below. Guard the exit status with `|| true` and default to 0 if empty.
FAKE_CLAUDE_COUNT=$(pgrep -c -f "fake-claude" 2>/dev/null || true)
FAKE_CLAUDE_COUNT=${FAKE_CLAUDE_COUNT:-0}

if [ "$FAKE_CLAUDE_COUNT" -gt 0 ]; then
    log "Found $FAKE_CLAUDE_COUNT fake-claude processes still running"
    fail "Fake claude process(es) still running"
else
    pass "Fake claude process terminated"
fi
|
||||||
|
|
||||||
|
# Final verdict: exit non-zero as soon as we know any check called fail(),
# otherwise report success.
echo ""
if [ "$PASS" != true ]; then
    log "Some tests failed. See log at $LOGFILE"
    exit 1
fi
log "All tests passed!"
exit 0
|
||||||
149
tools/vault-seed-ops-repo.sh
Executable file
149
tools/vault-seed-ops-repo.sh
Executable file
|
|
@ -0,0 +1,149 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# =============================================================================
|
||||||
|
# tools/vault-seed-ops-repo.sh — Idempotent seed for kv/disinto/shared/ops-repo
|
||||||
|
#
|
||||||
|
# Part of the Nomad+Vault migration (S5.1, issue #1035). Populates the KV v2
|
||||||
|
# path that nomad/jobs/edge.hcl dispatcher task reads from, so the edge
|
||||||
|
# proxy has FORGE_TOKEN for ops repo access.
|
||||||
|
#
|
||||||
|
# Seeds from kv/disinto/bots/vault (the vault bot credentials) — copies the
|
||||||
|
# token field to kv/disinto/shared/ops-repo. This is the "service" path that
|
||||||
|
# dispatcher uses, distinct from the "agent" path (bots/vault) used by
|
||||||
|
# agent tasks under the service-agents policy.
|
||||||
|
#
|
||||||
|
# Idempotency contract:
|
||||||
|
# - Key present with non-empty value → leave untouched, log "token unchanged".
|
||||||
|
# - Key missing or empty → copy from bots/vault, log "token copied".
|
||||||
|
# - If bots/vault is also empty → generate a random value, log "token generated".
|
||||||
|
#
|
||||||
|
# Preconditions:
|
||||||
|
# - Vault reachable + unsealed at $VAULT_ADDR.
|
||||||
|
# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable.
|
||||||
|
# - The `kv/` mount is enabled as KV v2.
|
||||||
|
#
|
||||||
|
# Requires:
|
||||||
|
# - VAULT_ADDR (e.g. http://127.0.0.1:8200)
|
||||||
|
# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh)
|
||||||
|
# - curl, jq, openssl
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# tools/vault-seed-ops-repo.sh
|
||||||
|
# tools/vault-seed-ops-repo.sh --dry-run
|
||||||
|
#
|
||||||
|
# Exit codes:
|
||||||
|
# 0 success (seed applied, or already applied)
|
||||||
|
# 1 precondition / API / mount-mismatch failure
|
||||||
|
# =============================================================================
|
||||||
|
set -euo pipefail

# Resolve the repo root relative to this script (tools/ → repo root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Vault HTTP helpers: token resolution, KV v2 helpers, request wrapper.
# shellcheck source=../lib/hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"

# KV v2 mount + logical paths
KV_MOUNT="kv"
OPS_REPO_PATH="disinto/shared/ops-repo" # "service" path read by the edge dispatcher
VAULT_BOT_PATH="disinto/bots/vault"     # "agent" path: seed source (vault bot creds)

# KV v2 API paths: data reads/writes go through <mount>/data/<logical-path>.
OPS_REPO_API="${KV_MOUNT}/data/${OPS_REPO_PATH}"
VAULT_BOT_API="${KV_MOUNT}/data/${VAULT_BOT_PATH}"
|
||||||
|
|
||||||
|
# log: prefixed status line to stdout.
log() {
    printf '[vault-seed-ops-repo] %s\n' "$*"
}

# die: prefixed error line to stderr, then abort with exit status 1.
die() {
    printf '[vault-seed-ops-repo] ERROR: %s\n' "$*" >&2
    exit 1
}
|
||||||
|
|
||||||
|
# ── Flag parsing ─────────────────────────────────────────────────────────────
# Accepted invocations: no arguments, `--dry-run`, `-h`, `--help`; anything
# else is fatal. Written as an explicit if/elif chain over ($#, $1); $1 is
# only dereferenced when $# >= 1, so `set -u` stays happy.
DRY_RUN=0
if [ "$#" -eq 0 ]; then
    : # defaults only
elif [ "$#" -eq 1 ] && [ "$1" = "--dry-run" ]; then
    DRY_RUN=1
elif [ "$#" -eq 1 ] && { [ "$1" = "-h" ] || [ "$1" = "--help" ]; }; then
    printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
    printf 'Seed kv/disinto/shared/ops-repo with FORGE_TOKEN.\n\n'
    printf 'Copies token from kv/disinto/bots/vault if present;\n'
    printf 'otherwise generates a random value. Idempotent:\n'
    printf 'existing non-empty values are left untouched.\n\n'
    printf ' --dry-run Print planned actions without writing.\n'
    exit 0
else
    die "invalid arguments: $* (try --help)"
fi
|
||||||
|
|
||||||
|
# ── Preconditions ────────────────────────────────────────────────────────────
# Required CLI tools must be on PATH before any Vault API work starts.
for tool in curl jq openssl; do
    if ! command -v "$tool" >/dev/null 2>&1; then
        die "required binary not found: ${tool}"
    fi
done

# Vault must be addressable and our token must pass a lookup probe.
if [ -z "${VAULT_ADDR:-}" ]; then
    die "VAULT_ADDR unset — e.g. export VAULT_ADDR=http://127.0.0.1:8200"
fi
if ! hvault_token_lookup >/dev/null; then
    die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
fi
|
||||||
|
|
||||||
|
# ── Step 1/2: ensure kv/ mount exists and is KV v2 ───────────────────────────
log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──"
# Exported so the helper can honor dry-run itself — presumably
# hvault_ensure_kv_v2 reads $DRY_RUN from the environment; confirm in
# lib/hvault.sh.
export DRY_RUN
hvault_ensure_kv_v2 "$KV_MOUNT" "[vault-seed-ops-repo]" \
    || die "KV mount check failed"
|
||||||
|
|
||||||
|
# ── Step 2/2: seed ops-repo from vault bot ───────────────────────────────────
log "── Step 2/2: seed ${OPS_REPO_API} ──"

# Read existing ops-repo value. Empty output presumably means "path absent"
# per hvault_get_or_empty's contract — confirm in lib/hvault.sh; a hard read
# failure is fatal via the || die.
existing_raw="$(hvault_get_or_empty "${OPS_REPO_API}")" \
    || die "failed to read ${OPS_REPO_API}"

# Extract the current token; KV v2 nests the payload under .data.data.
# `// ""` maps a missing/null key to the empty string.
existing_token=""
if [ -n "$existing_raw" ]; then
    existing_token="$(printf '%s' "$existing_raw" | jq -r '.data.data.token // ""')"
fi
|
||||||
|
|
||||||
|
# Decide what to write: keep an existing non-empty token, else copy the vault
# bot's token, else mint a fresh random one. `action` stays empty on the keep
# path — the no-op check below keys off that.
desired_token="$existing_token"
action=""

if [ -z "$existing_token" ]; then
    # Token missing — attempt to reuse the vault bot's credential.
    donor_json="$(hvault_get_or_empty "${VAULT_BOT_API}")" || true
    donor_token=""
    if [ -n "$donor_json" ]; then
        donor_token="$(printf '%s' "$donor_json" | jq -r '.data.data.token // ""')"
    fi

    if [ -n "$donor_token" ]; then
        desired_token="$donor_token"
        action="copied"
    elif [ "$DRY_RUN" -eq 1 ]; then
        # Dry-run: record the intent without generating anything.
        action="generated (dry-run)"
    else
        desired_token="$(openssl rand -hex 32)"
        action="generated"
    fi
fi
|
||||||
|
|
||||||
|
# Keep path: token already present and non-empty — leave it untouched
# (idempotency contract: "token unchanged").
if [ -z "$action" ]; then
    log "all keys present at ${OPS_REPO_API} — no-op"
    log "token unchanged"
    exit 0
fi

# Dry-run: report the planned action, write nothing.
if [ "$DRY_RUN" -eq 1 ]; then
    log "[dry-run] ${OPS_REPO_PATH}: would ${action} token"
    exit 0
fi
|
||||||
|
|
||||||
|
# Write the token. jq -n builds the KV v2 write body, {"data":{"token":...}},
# with the value passed via --arg so it is JSON-escaped safely.
payload="$(jq -n --arg t "$desired_token" '{data: {token: $t}}')"
# _hvault_request: project helper from lib/hvault.sh — presumably performs an
# authenticated Vault HTTP call; confirm its error semantics there.
_hvault_request POST "${OPS_REPO_API}" "$payload" >/dev/null \
    || die "failed to write ${OPS_REPO_API}"

log "${OPS_REPO_PATH}: ${action} token"
log "done — ${OPS_REPO_API} seeded"
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
|
<!-- last-reviewed: 5ba18c8f80da6e3e574823e39e5aa760731c1705 -->
|
||||||
# vault/policies/ — Agent Instructions
|
# vault/policies/ — Agent Instructions
|
||||||
|
|
||||||
HashiCorp Vault ACL policies for the disinto factory. One `.hcl` file per
|
HashiCorp Vault ACL policies for the disinto factory. One `.hcl` file per
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue