diff --git a/.woodpecker/detect-duplicates.py b/.woodpecker/detect-duplicates.py index 9c87b1d..b96ab19 100644 --- a/.woodpecker/detect-duplicates.py +++ b/.woodpecker/detect-duplicates.py @@ -294,10 +294,6 @@ def main() -> int: "9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern", # Standard lib source block shared across formula-driven agent run scripts "330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)", - # Test data for duplicate service detection tests (#850) - # Intentionally duplicated TOML blocks in smoke-init.sh and test-duplicate-service-detection.sh - "334967b8b4f1a8d3b0b9b8e0912f3bfb": "Test TOML: [agents.llama] block header (smoke-init.sh + test-duplicate-service-detection.sh)", - "d82f30077e5bb23b5fc01db003033d5d": "Test TOML: [agents.llama] block body (smoke-init.sh + test-duplicate-service-detection.sh)", # Common vault-seed script patterns: logging helpers + flag parsing # Used in tools/vault-seed-woodpecker.sh + lib/init/nomad/wp-oauth-register.sh "843a1cbf987952697d4e05e96ed2b2d5": "Logging helpers + DRY_RUN init (vault-seed-woodpecker + wp-oauth-register)", @@ -315,6 +311,10 @@ def main() -> int: # Common vault-seed script flag parsing patterns # Shared across tools/vault-seed-{forgejo,ops-repo}.sh "6906b7787796c2ccb8dd622e2ad4e7bf": "vault-seed DRY_RUN init + case pattern (forgejo + ops-repo)", + # Test data for duplicate service detection tests (#850) + # Intentionally duplicated TOML blocks in smoke-init.sh and test-duplicate-service-detection.sh + "334967b8b4f1a8d3b0b9b8e0912f3bfb": "Test TOML: [agents.llama] block header (smoke-init.sh + test-duplicate-service-detection.sh)", + "d82f30077e5bb23b5fc01db003033d5d": "Test TOML: [agents.llama] block body (smoke-init.sh + test-duplicate-service-detection.sh)", "a0df5283b616b964f8bc32fd99ec1b5a": "vault-seed case pattern start (forgejo + ops-repo)", "e15e3272fdd9f0f46ce9e726aea9f853": "vault-seed case pattern dry-run handler (forgejo + ops-repo)", "c9f22385cc49a3dac1d336bc14c6315b": "vault-seed DRY_RUN assignment (forgejo + ops-repo)", diff --git a/AGENTS.md b/AGENTS.md index c335aae..9c42667 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,4 +1,4 @@ - + # Disinto — Agent Instructions ## What this repo is diff --git a/architect/AGENTS.md b/architect/AGENTS.md index 99eebc9..7286ee3 100644 --- a/architect/AGENTS.md +++ b/architect/AGENTS.md @@ -1,4 +1,4 @@ - + # Architect — Agent Instructions ## What this agent is diff --git a/bin/disinto b/bin/disinto index 05e766f..7f6379d 100755 --- a/bin/disinto +++ b/bin/disinto @@ -12,7 +12,6 @@ # disinto secrets Manage encrypted secrets # disinto run Run action in ephemeral runner container # disinto ci-logs [--step ] Read CI logs from Woodpecker SQLite -# disinto backup create Export factory state for migration # # Usage: # disinto init https://github.com/user/repo @@ -40,9 +39,7 @@ source "${FACTORY_ROOT}/lib/generators.sh" source "${FACTORY_ROOT}/lib/forge-push.sh" source "${FACTORY_ROOT}/lib/ci-setup.sh" source "${FACTORY_ROOT}/lib/release.sh" -source "${FACTORY_ROOT}/lib/backup.sh" source "${FACTORY_ROOT}/lib/claude-config.sh" -source "${FACTORY_ROOT}/lib/disinto/backup.sh" # backup create/import # ── Helpers ────────────────────────────────────────────────────────────────── @@ -65,9 +62,7 @@ Usage: disinto hire-an-agent [--formula ] [--local-model ] [--model ] Hire a new agent (create user + .profile repo; re-run to rotate credentials) disinto agent Manage agent state (enable/disable) - disinto backup create Export factory state (issues + ops bundle) disinto edge [options] Manage edge tunnel registrations - disinto backup Backup and restore factory state Edge subcommands: register [project] Register a new tunnel (generates keypair if needed) @@ -106,18 +101,6 @@ Hire an agent options: CI logs options: --step Filter logs to a specific step (e.g., smoke-init) - -Backup subcommands: - create Create backup of factory state to tarball - import Restore factory state from backup tarball - -Import behavior: - - Unpacks tarball to temp directory - - Creates disinto repo via Forgejo API (mirror config is manual) - - Creates disinto-ops repo and pushes refs from bundle - - Imports issues from issues/*.json (idempotent - skips existing) - - Logs issue number mapping (Forgejo auto-assigns numbers) - - Prints summary: created X repos, pushed Y refs, imported Z issues, skipped W EOF exit 1 } @@ -2910,33 +2893,6 @@ EOF esac } -# ── backup command ──────────────────────────────────────────────────────────── -# Usage: disinto backup [args] -# Subcommands: -# create Create backup of factory state -# import Restore factory state from backup -disinto_backup() { - local subcmd="${1:-}" - shift || true - - case "$subcmd" in - create) - backup_create "$@" - ;; - import) - backup_import "$@" - ;; - *) - echo "Usage: disinto backup [args]" >&2 - echo "" >&2 - echo "Subcommands:" >&2 - echo " create Create backup of factory state" >&2 - echo " import Restore factory state from backup" >&2 - exit 1 - ;; - esac -} - # ── Main dispatch ──────────────────────────────────────────────────────────── case "${1:-}" in @@ -2953,7 +2909,6 @@ case "${1:-}" in hire-an-agent) shift; disinto_hire_an_agent "$@" ;; agent) shift; disinto_agent "$@" ;; edge) shift; disinto_edge "$@" ;; - backup) shift; disinto_backup "$@" ;; -h|--help) usage ;; *) usage ;; esac diff --git a/dev/AGENTS.md b/dev/AGENTS.md index 867d654..c64551f 100644 --- a/dev/AGENTS.md +++ b/dev/AGENTS.md @@ -1,4 +1,4 @@ - + # Dev Agent **Role**: Implement issues autonomously — write code, push branches, address diff --git a/docker/edge/entrypoint-edge.sh b/docker/edge/entrypoint-edge.sh index 83131fb..6db96b7 100755 --- a/docker/edge/entrypoint-edge.sh +++ b/docker/edge/entrypoint-edge.sh @@ -173,15 +173,11 @@ PROJECT_TOML="${PROJECT_TOML:-projects/disinto.toml}" sleep 1200 # 20 minutes done) & -# ── Load optional secrets from secrets/*.enc (#777) ──────────────────── -# Engagement collection (collect-engagement.sh) requires CADDY_ secrets to -# SCP access logs from a remote edge host. When age key or secrets dir is -# missing, or any secret fails to decrypt, log a warning and skip the cron. -# Caddy itself does not depend on these secrets. +# ── Load required secrets from secrets/*.enc (#777) ──────────────────── +# Edge container declares its required secrets; missing ones cause a hard fail. _AGE_KEY_FILE="${HOME}/.config/sops/age/keys.txt" _SECRETS_DIR="/opt/disinto/secrets" EDGE_REQUIRED_SECRETS="CADDY_SSH_KEY CADDY_SSH_HOST CADDY_SSH_USER CADDY_ACCESS_LOG" -EDGE_ENGAGEMENT_READY=0 # Assume not ready until proven otherwise _edge_decrypt_secret() { local enc_path="${_SECRETS_DIR}/${1}.enc" @@ -196,53 +192,47 @@ if [ -f "$_AGE_KEY_FILE" ] && [ -d "$_SECRETS_DIR" ]; then export "$_secret_name=$_val" done if [ -n "$_missing" ]; then - echo "WARN: required engagement secrets missing from secrets/*.enc:${_missing}" >&2 - echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 - EDGE_ENGAGEMENT_READY=0 - else - echo "edge: loaded required engagement secrets: ${EDGE_REQUIRED_SECRETS}" >&2 - EDGE_ENGAGEMENT_READY=1 + echo "FATAL: required secrets missing from secrets/*.enc:${_missing}" >&2 + echo " Run 'disinto secrets add ' for each missing secret." >&2 + echo " If migrating from .env.vault.enc, run 'disinto secrets migrate-from-vault' first." >&2 + exit 1 fi + echo "edge: loaded required secrets: ${EDGE_REQUIRED_SECRETS}" >&2 else - echo "WARN: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — engagement secrets unavailable" >&2 - echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 - EDGE_ENGAGEMENT_READY=0 + echo "FATAL: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — cannot load required secrets" >&2 + echo " Ensure age is installed and secrets/*.enc files are present." >&2 + exit 1 fi # Start daily engagement collection cron loop in background (#745) # Runs collect-engagement.sh daily at ~23:50 UTC via a sleep loop that # calculates seconds until the next 23:50 window. SSH key from secrets/*.enc (#777). -# Guarded: only start if EDGE_ENGAGEMENT_READY=1. -if [ "$EDGE_ENGAGEMENT_READY" -eq 1 ]; then - (while true; do - # Calculate seconds until next 23:50 UTC - _now=$(date -u +%s) - _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) - if [ "$_target" -le "$_now" ]; then - _target=$(( _target + 86400 )) - fi - _sleep_secs=$(( _target - _now )) - echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 - sleep "$_sleep_secs" - _fetch_log="/tmp/caddy-access-log-fetch.log" - _ssh_key_file=$(mktemp) - printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" - chmod 0600 "$_ssh_key_file" - scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ - "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ - "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true - rm -f "$_ssh_key_file" - if [ -s "$_fetch_log" ]; then - CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ - | tee -a /opt/disinto-logs/collect-engagement.log || true - else - echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 - fi - rm -f "$_fetch_log" - done) & -else - echo "edge: collect-engagement cron skipped (EDGE_ENGAGEMENT_READY=0)" >&2 -fi +(while true; do + # Calculate seconds until next 23:50 UTC + _now=$(date -u +%s) + _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) + if [ "$_target" -le "$_now" ]; then + _target=$(( _target + 86400 )) + fi + _sleep_secs=$(( _target - _now )) + echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 + sleep "$_sleep_secs" + _fetch_log="/tmp/caddy-access-log-fetch.log" + _ssh_key_file=$(mktemp) + printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" + chmod 0600 "$_ssh_key_file" + scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ + "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ + "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true + rm -f "$_ssh_key_file" + if [ -s "$_fetch_log" ]; then + CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ + | tee -a /opt/disinto-logs/collect-engagement.log || true + else + echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 + fi + rm -f "$_fetch_log" +done) & # Nomad template renders Caddyfile to /local/Caddyfile via service discovery; # copy it into the expected location if present (compose uses the mounted path). diff --git a/docs/nomad-cutover-runbook.md b/docs/nomad-cutover-runbook.md deleted file mode 100644 index e0956cc..0000000 --- a/docs/nomad-cutover-runbook.md +++ /dev/null @@ -1,183 +0,0 @@ -# Nomad Cutover Runbook - -End-to-end procedure to cut over the disinto factory from docker-compose on -disinto-dev-box to Nomad on disinto-nomad-box. - -**Target**: disinto-nomad-box (10.10.10.216) becomes production; disinto-dev-box -stays warm for rollback. - -**Downtime budget**: <5 min blue-green flip. - -**Data scope**: Forgejo issues + disinto-ops git bundle only. Everything else is -regenerated or discarded. OAuth secrets are regenerated on fresh init (all -sessions invalidated). - ---- - -## 1. Pre-cutover readiness checklist - -- [ ] Nomad + Vault stack healthy on a fresh wipe+init (step 5 verified) -- [ ] Codeberg mirror current — `git log` parity between dev-box Forgejo and - Codeberg -- [ ] SSH key pair generated for nomad-box, registered on DO edge (see §4.6) -- [ ] Companion tools landed: - - `disinto backup create` (#1057) - - `disinto backup import` (#1058) -- [ ] Backup tarball produced and tested against a scratch LXC (see §3) - ---- - -## 2. Pre-cutover artifact: backup - -On disinto-dev-box: - -```bash -./bin/disinto backup create /tmp/disinto-backup-$(date +%Y%m%d).tar.gz -``` - -Copy the tarball to nomad-box (and optionally to a local workstation for -safekeeping): - -```bash -scp /tmp/disinto-backup-*.tar.gz nomad-box:/tmp/ -``` - ---- - -## 3. Pre-cutover dry-run - -On a throwaway LXC: - -```bash -lxc launch ubuntu:24.04 cutover-dryrun -# inside the container: -disinto init --backend=nomad --import-env .env --with edge -./bin/disinto backup import /tmp/disinto-backup-*.tar.gz -``` - -Verify: - -- Issue count matches source Forgejo -- disinto-ops repo refs match source bundle - -Destroy the LXC once satisfied: - -```bash -lxc delete cutover-dryrun --force -``` - ---- - -## 4. Cutover T-0 (operator executes; <5 min target) - -### 4.1 Stop dev-box services - -```bash -# On disinto-dev-box — stop, do NOT remove volumes (rollback needs them) -docker-compose stop -``` - -### 4.2 Provision nomad-box (if not already done) - -```bash -# On disinto-nomad-box -disinto init --backend=nomad --import-env .env --with edge -``` - -### 4.3 Import backup - -```bash -# On disinto-nomad-box -./bin/disinto backup import /tmp/disinto-backup-*.tar.gz -``` - -### 4.4 Configure Codeberg pull mirror - -Manual, one-time step in the new Forgejo UI: - -1. Create a mirror repository pointing at the Codeberg upstream -2. Confirm initial sync completes - -### 4.5 Claude login - -```bash -# On disinto-nomad-box -claude login -``` - -Set up Anthropic OAuth so agents can authenticate. - -### 4.6 Autossh tunnel swap - -> **Operator step** — cross-host, no dev-agent involvement. Do NOT automate. - -1. Stop the tunnel on dev-box: - ```bash - # On disinto-dev-box - systemctl stop reverse-tunnel - ``` - -2. Copy or regenerate the tunnel unit on nomad-box: - ```bash - # Copy from dev-box, or let init regenerate it - scp dev-box:/etc/systemd/system/reverse-tunnel.service \ - nomad-box:/etc/systemd/system/ - ``` - -3. Register nomad-box's public key on DO edge: - ```bash - # On DO edge box — same restricted-command as the dev-box key - echo "" >> /home/johba/.ssh/authorized_keys - ``` - -4. Start the tunnel on nomad-box: - ```bash - # On disinto-nomad-box - systemctl enable --now reverse-tunnel - ``` - -5. Verify end-to-end: - ```bash - curl https://self.disinto.ai/api/v1/version - # Should return the new box's Forgejo version - ``` - ---- - -## 5. Post-cutover smoke - -- [ ] `curl https://self.disinto.ai` → Forgejo welcome page -- [ ] Create a test PR → Woodpecker pipeline runs → agents assign and work -- [ ] Claude chat login via Forgejo OAuth succeeds - ---- - -## 6. Rollback (if any step 4 gate fails) - -1. Stop the tunnel on nomad-box: - ```bash - systemctl stop reverse-tunnel # on nomad-box - ``` - -2. Restore the tunnel on dev-box: - ```bash - systemctl start reverse-tunnel # on dev-box - ``` - -3. Bring dev-box services back up: - ```bash - docker-compose up -d # on dev-box - ``` - -4. DO Caddy config is unchanged — traffic restores in <5 min. - -5. File a post-mortem issue. Keep nomad-box state intact for debugging. - ---- - -## 7. Post-stable cleanup (T+1 week) - -- `docker-compose down -v` on dev-box -- Archive `/var/lib/docker/volumes/disinto_*` to cold storage -- Delete disinto-dev-box LXC or keep as permanent rollback reserve (operator - decision) diff --git a/gardener/AGENTS.md b/gardener/AGENTS.md index c51faad..5dcd12f 100644 --- a/gardener/AGENTS.md +++ b/gardener/AGENTS.md @@ -1,4 +1,4 @@ - + # Gardener Agent **Role**: Backlog grooming — detect duplicate issues, missing acceptance diff --git a/gardener/pending-actions.json b/gardener/pending-actions.json index 2ae5b96..1dbf2a3 100644 --- a/gardener/pending-actions.json +++ b/gardener/pending-actions.json @@ -1,7 +1,52 @@ [ { - "action": "close", - "issue": 1050, - "reason": "Already implemented by PR #1051 (merged 2026-04-19). lib/pr-lifecycle.sh and lib/ci-helpers.sh updated with per-workflow/per-step CI diagnostics." + "action": "edit_body", + "issue": 1025, + "body": "## Prior art: PR #1033 (open, branch `fix/issue-1025` retained)\n\nFirst attempt by dev-qwen2 (head `f692dd2`). Test script (`tests/smoke-edge-subpath.sh`, 13.8 KB) and pipeline (`.woodpecker/edge-subpath.yml`) both landed and look reasonable, but the **CI harness design is wrong**: the pipeline boots a bare `alpine:3.19` container and runs the smoke script directly against `BASE_URL=http://localhost`, with no stack to test against.\n\n**This is a harness design gap, not a script bug.** The smoke script itself is a reasonable post-deploy tool — the mistake was trying to exercise it as a hermetic CI step.\n\n**Approach (Option 1 — split the work):**\n\nKeep `tests/smoke-edge-subpath.sh` as an out-of-CI post-deploy tool (accepts `BASE_URL` env var). Replace the CI pipeline step that tries to curl a live stack with static checks only: `shellcheck`, `caddy validate` on the generated Caddyfile, and a template-substitution unit test that verifies routing block shape.\n\nBranch `fix/issue-1025` is preserved at `f692dd2` — the smoke script body is reusable; only the pipeline harness needs a rethink.\n\n**Timeline:**\n- 2026-04-19 09:14 — dev-qwen2 last pushed `f692dd2`\n- 3 pipelines (#1378/#1380/#1382) all fail: no service to curl (connection refused)\n\n## Acceptance criteria\n- [ ] `.woodpecker/edge-subpath.yml` pipeline runs `shellcheck` on `tests/smoke-edge-subpath.sh` with no live service curl\n- [ ] `caddy validate` runs on the generated Caddyfile in CI (template-substitution unit test)\n- [ ] A template-substitution test verifies the Caddyfile routing block shape (forge/ci/staging/chat paths)\n- [ ] `tests/smoke-edge-subpath.sh` accepts `BASE_URL` env var for post-deploy staging runs\n- [ ] CI green (no connection-refused failures on Woodpecker)\n\n## Affected files\n- `.woodpecker/edge-subpath.yml` — pipeline config (static checks only, no service curl)\n- `tests/smoke-edge-subpath.sh` — out-of-CI smoke script (reusable from PR #1033)\n\n## Dependencies\n- #1038 should land first to unblock local edge staging runs (optional — CI fix is independent)" + }, + { + "action": "remove_label", + "issue": 1025, + "label": "blocked" + }, + { + "action": "add_label", + "issue": 1025, + "label": "backlog" + }, + { + "action": "edit_body", + "issue": 1038, + "body": "## Problem\n\n`disinto-edge` crashloops on any deployment that has not opted into the age-encrypted secret store (#777), because the edge entrypoint treats four secrets as unconditionally required:\n\n```\nFATAL: age key (/home/agent/.config/sops/age/keys.txt) or secrets dir (/opt/disinto/secrets) not found — cannot load required secrets\n```\n\nObserved on `disinto-dev-box` (container `disinto-edge`, restarting every ~30s), which blocks PR #1033 (edge-subpath smoke test) and any other work that depends on a running edge.\n\n## Root cause\n\n`docker/edge/entrypoint-edge.sh:176-205` requires:\n\n- `~/.config/sops/age/keys.txt`\n- `/opt/disinto/secrets/` with `.enc` files for `CADDY_SSH_KEY`, `CADDY_SSH_HOST`, `CADDY_SSH_USER`, `CADDY_ACCESS_LOG`.\n\nThese four secrets feed exactly one feature: the daily 23:50 UTC `collect-engagement.sh` cron (#745), which SCPs Caddy access logs from a **remote production edge host** for engagement parsing. On a local factory box or any deployment that has not set up a remote edge, this code path has no target — yet its absence kills the whole edge container.\n\n## Fix\n\nMake the secrets block **optional**. When age key or secrets dir is missing, or any of the four CADDY_ secrets fail to decrypt, log a warning and skip the `collect-engagement` cron loop. Caddy itself does not depend on these secrets and should start normally.\n\nThe concrete edit is around lines 176-205 of `docker/edge/entrypoint-edge.sh` — guard the secret-loading block with a check for the age key and secrets dir, set `EDGE_ENGAGEMENT_READY=0` on failure, and skip cron registration when `EDGE_ENGAGEMENT_READY != 1`.\n\n## Acceptance criteria\n- [ ] `docker/edge/entrypoint-edge.sh` loads CADDY_ secrets optionally — missing age key or secrets dir logs a warning and continues, does not FATAL\n- [ ] Caddy starts normally when CADDY_ secrets are absent\n- [ ] `collect-engagement` cron is skipped (not registered) when engagement secrets are unavailable\n- [ ] On deployments WITH secrets configured, behavior is unchanged (collect-engagement cron still fires at 23:50 UTC)\n- [ ] CI green\n\n## Affected files\n- `docker/edge/entrypoint-edge.sh` — lines 176-205, secrets loading block made optional" + }, + { + "action": "remove_label", + "issue": 1038, + "label": "blocked" + }, + { + "action": "add_label", + "issue": 1038, + "label": "backlog" + }, + { + "action": "edit_body", + "issue": 850, + "body": "## Problem\n\nWhen the compose generator emits the same service name twice — e.g. both the legacy `ENABLE_LLAMA_AGENT=1` branch and a matching `[agents.llama]` TOML block produce an `agents-llama:` key — the failure is deferred all the way to `docker compose` YAML parsing:\n\n```\nfailed to parse /home/johba/disinto/docker-compose.yml: yaml: construct errors:\n line 4: line 431: mapping key \"agents-llama\" already defined at line 155\n```\n\nBy then, the user has already paid the cost of: pre-build binary downloads, generator run, Caddyfile regeneration. The only hint about what went wrong is a line number in a generated file. Root cause (dual activation) is not surfaced.\n\n## Fix\n\nAdd a generate-time guard to `lib/generators.sh`:\n\n- After collecting all service blocks to emit, compare the set of service names against duplicates.\n- If a duplicate is detected, abort with a clear message naming both sources of truth (e.g. `\"agents-llama\" emitted twice — from ENABLE_LLAMA_AGENT=1 and from [agents.llama] in projects/disinto.toml; remove one`).\n\n## Prior art: PR #872 (closed, branch `fix/issue-850` retained)\n\ndev-qwen's first attempt (`db009e3`) landed the dup-detection logic in `lib/generators.sh` correctly (unit test `tests/test-duplicate-service-detection.sh` passes all 3 cases), but the smoke test fails on CI.\n\n**Why the smoke test fails:** sections 1-7 of `smoke-init.sh` already run `bin/disinto init`, materializing `docker-compose.yml`. Section 8 re-invokes `bin/disinto init` to verify the dup guard fires — but `_generate_compose_impl` early-returns with `\"Compose: already exists, skipping\"` before reaching the dup-check.\n\n**Suggested fix:** in `tests/smoke-init.sh` section 8 (around line 452, before the second `bin/disinto init` invocation), add:\n\n```bash\nrm -f \"${FACTORY_ROOT}/docker-compose.yml\"\n```\n\nso the generator actually runs and the dup-detection path is exercised. Do **not** hoist the dup-check above the early-return.\n\nThe branch `fix/issue-850` is preserved as a starting point — pick up from `db009e3` and patch the smoke-test cleanup.\n\nRelated: #846.\n\n## Acceptance criteria\n- [ ] `bin/disinto init` with a config that would produce duplicate service names aborts with a clear error message naming both sources (e.g. `ENABLE_LLAMA_AGENT=1` and `[agents.llama]` TOML block)\n- [ ] `tests/smoke-init.sh` section 8 removes `docker-compose.yml` before re-invoking `disinto init` so the dup guard is exercised\n- [ ] Unit test `tests/test-duplicate-service-detection.sh` passes all 3 cases\n- [ ] CI green (smoke-init.sh section 8 no longer skips dup detection)\n\n## Affected files\n- `lib/generators.sh` — duplicate service name check after collecting all service blocks\n- `tests/smoke-init.sh` — section 8: add `rm -f \\${FACTORY_ROOT}/docker-compose.yml` before second `disinto init`" + }, + { + "action": "remove_label", + "issue": 850, + "label": "blocked" + }, + { + "action": "add_label", + "issue": 850, + "label": "backlog" + }, + { + "action": "comment", + "issue": 758, + "body": "This issue is the critical path blocker for #820 (ops repo re-seed) and #982 (collect-engagement commit fix). Both are in the backlog and ready to merge, but cannot run until ops repo branch protection is resolved. Needs admin/human action to change Forgejo branch protection settings on disinto-ops — no code change can unblock this." } ] diff --git a/lib/AGENTS.md b/lib/AGENTS.md index cbeb1dd..b54f5cb 100644 --- a/lib/AGENTS.md +++ b/lib/AGENTS.md @@ -1,4 +1,4 @@ - + # Shared Helpers (`lib/`) All agents source `lib/env.sh` as their first action. Additional helpers are @@ -7,7 +7,7 @@ sourced as needed. | File | What it provides | Sourced by | |---|---|---| | `lib/env.sh` | Loads `.env`, sets `FACTORY_ROOT`, exports project config (`FORGE_REPO`, `PROJECT_NAME`, etc.), defines `log()`, `forge_api()`, `forge_api_all()` (paginates all pages; accepts optional second TOKEN parameter, defaults to `$FORGE_TOKEN`; handles invalid/empty JSON responses gracefully — returns empty on parse error instead of crashing), `woodpecker_api()`, `wpdb()`, `memory_guard()` (skips agent if RAM < threshold), `load_secret()` (secret-source abstraction — see below). Auto-loads project TOML if `PROJECT_TOML` is set. Exports per-agent tokens (`FORGE_PLANNER_TOKEN`, `FORGE_GARDENER_TOKEN`, `FORGE_VAULT_TOKEN`, `FORGE_SUPERVISOR_TOKEN`, `FORGE_PREDICTOR_TOKEN`) — each falls back to `$FORGE_TOKEN` if not set. **Vault-only token guard (AD-006)**: `unset GITHUB_TOKEN CLAWHUB_TOKEN` so agents never hold external-action tokens — only the runner container receives them. **Container note**: when `DISINTO_CONTAINER=1`, `.env` is NOT re-sourced — compose already injects env vars (including `FORGE_URL=http://forgejo:3000`) and re-sourcing would clobber them. **Save/restore scope (#364)**: only `FORGE_URL` is preserved across `.env` re-sourcing (compose injects `http://forgejo:3000`, `.env` has `http://localhost:3000`). `FORGE_TOKEN` is NOT preserved so refreshed tokens in `.env` take effect immediately. **Per-agent token override (#762)**: agent run scripts export `FORGE_TOKEN_OVERRIDE=` BEFORE sourcing `env.sh`; `env.sh` applies this override at lines 98-100, ensuring the correct identity survives any re-sourcing of `env.sh` by nested shells or `claude -p` invocations. **Required env var**: `FORGE_PASS` — bot password for git HTTP push (Forgejo 11.x rejects API tokens for `git push`, #361). **Hard preconditions (#674)**: `USER` and `HOME` must be exported by the entrypoint before sourcing. When `PROJECT_TOML` is set, `PROJECT_REPO_ROOT`, `PRIMARY_BRANCH`, and `OPS_REPO_ROOT` must also be set (by entrypoint or TOML). **`load_secret NAME [DEFAULT]` (#793)**: backend-agnostic secret resolution. Precedence: (1) `/secrets/.env` — Nomad-rendered template, (2) current environment — already set by `.env.enc` / compose, (3) `secrets/.enc` — age-encrypted per-key file (decrypted on demand, cached in process env), (4) DEFAULT or empty. Consumers call `$(load_secret GITHUB_TOKEN)` instead of `${GITHUB_TOKEN}` — identical behavior whether secrets come from Docker compose injection or Nomad Vault templates. | Every agent | -| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `ci_required_for_pr()` — returns 0 if PR has code files (CI required), 1 if non-code only (CI not required). `is_infra_step()` — returns 0 if a single CI step failure matches infra heuristics (clone/git exit 128, any exit 137, log timeout patterns). `classify_pipeline_failure()` — returns "infra \" if any failed Woodpecker step matches infra heuristics via `is_infra_step()`, else "code". `ensure_priority_label()` — looks up (or creates) the `priority` label and returns its ID; caches in `_PRIORITY_LABEL_ID`. `ci_commit_status ` — queries Woodpecker directly for CI state, falls back to forge commit status API. `ci_pipeline_number ` — returns the Woodpecker pipeline number for a commit, falls back to parsing forge status `target_url`. `ci_promote ` — promotes a pipeline to a named Woodpecker environment (vault-gated deployment: vault approves, vault-fire calls this — vault redesign in progress, see #73-#77). `ci_get_logs [--step ]` — reads CI logs from Woodpecker SQLite database via `lib/ci-log-reader.py`; outputs last 200 lines to stdout. Requires mounted woodpecker-data volume at /woodpecker-data. `ci_get_step_logs ` — fetches per-step logs via Woodpecker REST API (`/repos/{id}/logs/{pipeline}/{step_id}`); returns raw log data for a single step. Used by `pr_poll_ci()` to build per-workflow/per-step CI diagnostics (#1051). | dev-poll, review-poll, review-pr | +| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `ci_required_for_pr()` — returns 0 if PR has code files (CI required), 1 if non-code only (CI not required). `is_infra_step()` — returns 0 if a single CI step failure matches infra heuristics (clone/git exit 128, any exit 137, log timeout patterns). `classify_pipeline_failure()` — returns "infra \" if any failed Woodpecker step matches infra heuristics via `is_infra_step()`, else "code". `ensure_priority_label()` — looks up (or creates) the `priority` label and returns its ID; caches in `_PRIORITY_LABEL_ID`. `ci_commit_status ` — queries Woodpecker directly for CI state, falls back to forge commit status API. `ci_pipeline_number ` — returns the Woodpecker pipeline number for a commit, falls back to parsing forge status `target_url`. `ci_promote ` — promotes a pipeline to a named Woodpecker environment (vault-gated deployment: vault approves, vault-fire calls this — vault redesign in progress, see #73-#77). `ci_get_logs [--step ]` — reads CI logs from Woodpecker SQLite database via `lib/ci-log-reader.py`; outputs last 200 lines to stdout. Requires mounted woodpecker-data volume at /woodpecker-data. | dev-poll, review-poll, review-pr | | `lib/ci-debug.sh` | CLI tool for Woodpecker CI: `list`, `status`, `logs`, `failures` subcommands. Not sourced — run directly. | Humans / dev-agent (tool access) | | `lib/ci-log-reader.py` | Python tool: reads CI logs from Woodpecker SQLite database. ` [--step ]` — returns last 200 lines from failed steps (or specified step). Used by `ci_get_logs()` in ci-helpers.sh. Requires `WOODPECKER_DATA_DIR` (default: /woodpecker-data). | ci-helpers.sh | | `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `FORGE_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, mirror config, etc.). Also exports `FORGE_REPO_OWNER` (the owner component of `FORGE_REPO`, e.g. `disinto-admin` from `disinto-admin/disinto`). Reads `repo_root` and `ops_repo_root` from the TOML for host-CLI callers. **Container path handling (#674)**: no longer derives `PROJECT_REPO_ROOT` or `OPS_REPO_ROOT` inside the script — container entrypoints export the correct paths before agent scripts source `env.sh`, and the `DISINTO_CONTAINER` guard (line 90) skips TOML overrides when those vars are already set. | env.sh (when `PROJECT_TOML` is set) | @@ -20,7 +20,7 @@ sourced as needed. | `lib/stack-lock.sh` | File-based lock protocol for singleton project stack access. `stack_lock_acquire(holder, project)` — polls until free, breaks stale heartbeats (>10 min old), claims lock. `stack_lock_release(project)` — deletes lock file. `stack_lock_check(project)` — inspect current lock state. `stack_lock_heartbeat(project)` — update heartbeat timestamp (callers must call every 2 min while holding). Lock files at `~/data/locks/-stack.lock`. | docker/edge/dispatcher.sh, reproduce formula | | `lib/tea-helpers.sh` | `tea_file_issue(title, body, labels...)` — create issue via tea CLI with secret scanning; sets `FILED_ISSUE_NUM`. `tea_relabel(issue_num, labels...)` — replace labels using tea's `edit` subcommand (not `label`). `tea_comment(issue_num, body)` — add comment with secret scanning. `tea_close(issue_num)` — close issue. All use `TEA_LOGIN` and `FORGE_REPO` from env.sh. Labels by name (no ID lookup). Tea binary download verified via sha256 checksum. Sourced by env.sh when `tea` binary is available. | env.sh (conditional) | | `lib/worktree.sh` | Reusable git worktree management: `worktree_create(path, branch, [base_ref])` — create worktree, checkout base, fetch submodules. `worktree_recover(path, branch, [remote])` — detect existing worktree, reuse if on correct branch (sets `_WORKTREE_REUSED`), otherwise clean and recreate. `worktree_cleanup(path)` — `git worktree remove --force`, clear Claude Code project cache (`~/.claude/projects/` matching path). `worktree_cleanup_stale([max_age_hours])` — scan `/tmp` for orphaned worktrees older than threshold, skip preserved and active tmux worktrees, prune. `worktree_preserve(path, reason)` — mark worktree as preserved for debugging (writes `.worktree-preserved` marker, skipped by stale cleanup). | dev-agent.sh, supervisor-run.sh, planner-run.sh, predictor-run.sh, gardener-run.sh | -| `lib/pr-lifecycle.sh` | Reusable PR lifecycle library: `pr_create()`, `pr_find_by_branch()`, `pr_poll_ci()`, `pr_poll_review()`, `pr_merge()`, `pr_is_merged()`, `pr_walk_to_merge()`, `build_phase_protocol_prompt()`. `pr_poll_ci()` builds a **per-workflow/per-step CI diagnostics prompt** (#1051): on failure, each failed workflow gets its own section with step name, exit code (annotated with standard meanings for 126/127/128), and step-local log tail (via `ci_get_step_logs`); passing workflows are listed explicitly so agents don't waste fix attempts on them. Falls back to legacy combined-log fetch if per-step API is unavailable. Requires `lib/ci-helpers.sh`. | dev-agent.sh (future) | +| `lib/pr-lifecycle.sh` | Reusable PR lifecycle library: `pr_create()`, `pr_find_by_branch()`, `pr_poll_ci()`, `pr_poll_review()`, `pr_merge()`, `pr_is_merged()`, `pr_walk_to_merge()`, `build_phase_protocol_prompt()`. Requires `lib/ci-helpers.sh`. | dev-agent.sh (future) | | `lib/issue-lifecycle.sh` | Reusable issue lifecycle library: `issue_claim()` (add in-progress, remove backlog), `issue_release()` (remove in-progress, add backlog), `issue_block()` (post diagnostic comment with secret redaction, add blocked label), `issue_close()`, `issue_check_deps()` (parse deps, check transitive closure; sets `_ISSUE_BLOCKED_BY`, `_ISSUE_SUGGESTION`), `issue_suggest_next()` (find next unblocked backlog issue; sets `_ISSUE_NEXT`), `issue_post_refusal()` (structured refusal comment with dedup). Label IDs cached in globals on first lookup. Sources `lib/secret-scan.sh`. | dev-agent.sh (future) | | `lib/action-vault.sh` | **Vault PR helper** — create vault action PRs on ops repo via Forgejo API (works from containers without SSH). `vault_request ` validates TOML (using `validate_vault_action` from `action-vault/vault-env.sh`), creates branch `vault/`, writes `vault/actions/.toml`, creates PR targeting `main` with title `vault: ` and body from context field, returns PR number. Idempotent: if PR exists, returns existing number. **Low-tier bypass**: if the action's `blast_radius` classifies as `low` (via `action-vault/classify.sh`), `vault_request` calls `_vault_commit_direct()` which commits directly to ops `main` using `FORGE_ADMIN_TOKEN` — no PR, no approval wait. Returns `0` (not a PR number) for direct commits. Requires `FORGE_TOKEN`, `FORGE_ADMIN_TOKEN` (low-tier only), `FORGE_URL`, `FORGE_REPO`, `FORGE_OPS_REPO`. Uses the calling agent's own token (saves/restores `FORGE_TOKEN` around sourcing `vault-env.sh`), so approval workflow respects individual agent identities. | dev-agent (vault actions), future vault dispatcher | | `lib/branch-protection.sh` | Branch protection helpers for Forgejo repos. `setup_vault_branch_protection()` — configures admin-only merge protection on main (require 1 approval, restrict merge to admin role, block direct pushes). `setup_profile_branch_protection()` — same protection for `.profile` repos. `verify_branch_protection()` — checks protection is correctly configured. `remove_branch_protection()` — removes protection (cleanup/testing). Handles race condition after initial push: retries with backoff if Forgejo hasn't processed the branch yet. Requires `FORGE_TOKEN`, `FORGE_URL`, `FORGE_OPS_REPO`. | bin/disinto (hire-an-agent) | @@ -30,9 +30,7 @@ sourced as needed. | `lib/git-creds.sh` | Shared git credential helper configuration. `configure_git_creds([HOME_DIR] [RUN_AS_CMD])` — writes a static credential helper script and configures git globally to use password-based HTTP auth (Forgejo 11.x rejects API tokens for `git push`, #361). **Retry on cold boot (#741)**: resolves bot username from `FORGE_TOKEN` with 5 retries (exponential backoff 1-5s); fails loudly and returns 1 if Forgejo is unreachable — never falls back to a wrong hardcoded default (exports `BOT_USER` on success). `repair_baked_cred_urls([--as RUN_AS_CMD] DIR ...)` — rewrites any git remote URLs that have credentials baked in to use clean URLs instead; uses `safe.directory` bypass for root-owned repos (#671). Requires `FORGE_PASS`, `FORGE_URL`, `FORGE_TOKEN`. | entrypoints (agents, edge) | | `lib/ops-setup.sh` | `setup_ops_repo()` — creates ops repo on Forgejo if it doesn't exist, configures bot collaborators, clones/initializes ops repo locally, seeds directory structure (vault, knowledge, evidence, sprints). Evidence subdirectories seeded: engagement/, red-team/, holdout/, evolution/, user-test/. Also seeds sprints/ for architect output. Exports `_ACTUAL_OPS_SLUG`. `migrate_ops_repo(ops_root, [primary_branch])` — idempotent migration helper that seeds missing directories and .gitkeep files on existing ops repos (pre-#407 deployments). | bin/disinto (init) | | `lib/ci-setup.sh` | `_install_cron_impl()` — installs crontab entries for bare-metal deployments (compose mode uses polling loop instead). `_create_forgejo_oauth_app()` — generic helper to create an OAuth2 app on Forgejo (shared by Woodpecker and chat). `_create_woodpecker_oauth_impl()` — creates Woodpecker OAuth2 app (thin wrapper). `_create_chat_oauth_impl()` — creates disinto-chat OAuth2 app, writes `CHAT_OAUTH_CLIENT_ID`/`CHAT_OAUTH_CLIENT_SECRET` to `.env` (#708). `_generate_woodpecker_token_impl()` — auto-generates WOODPECKER_TOKEN via OAuth2 flow. `_activate_woodpecker_repo_impl()` — activates repo in Woodpecker. All gated by `_load_ci_context()` which validates required env vars. | bin/disinto (init) | -| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (**duplicate service detection**: tracks service names during generation, aborts with `ERROR: Duplicate service name '$name' detected` on conflict; detection state is reset between calls so idempotent reinvocation is safe, #850) (uses `codeberg.org/forgejo/forgejo:11.0` tag; `CLAUDE_BIN_DIR` volume mount removed from agents/llama services — only `reproduce` and `edge` still use the host-mounted CLI (#992); adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility; Forgejo includes a healthcheck so dependent services use `condition: service_healthy` — fixes cold-start races, #665; adds `chat` service block with isolated `chat-config` named volume and `CHAT_HISTORY_DIR` bind-mount for per-user NDJSON history persistence (#710); injects `FORWARD_AUTH_SECRET` for Caddy↔chat defense-in-depth auth (#709); cost-cap env vars `CHAT_MAX_REQUESTS_PER_HOUR`, `CHAT_MAX_REQUESTS_PER_DAY`, `CHAT_MAX_TOKENS_PER_DAY` (#711); subdomain fallback comment for `EDGE_TUNNEL_FQDN_*` vars (#713); all `depends_on` now use `condition: service_healthy/started` instead of bare service names; all services now include `restart: unless-stopped` including the edge service — #768; agents service now uses `image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}` instead of `build:` (#429); `WOODPECKER_PLUGINS_PRIVILEGED` env var added to woodpecker service (#779); agents-llama conditional block gated on `ENABLE_LLAMA_AGENT=1` (#769); `agents-llama-all` compose service (profile `agents-llama-all`, all 7 roles: review,dev,gardener,architect,planner,predictor,supervisor) added by #801; agents service gains volume mounts for `./projects`, `./.env`, `./state`), `generate_caddyfile()` — Caddyfile (routes: `/forge/*` → forgejo:3000, `/woodpecker/*` → woodpecker:8000, `/staging/*` → staging:80; `/chat/login` and `/chat/oauth/callback` bypass `forward_auth` so unauthenticated users can reach the OAuth flow; `/chat/*` gated by `forward_auth` on `chat:8080/chat/auth/verify` which stamps `X-Forwarded-User` (#709); root `/` redirects to `/forge/`), `generate_staging_index()` — staging index, `generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) | -| `lib/backup.sh` | Factory backup creation. `backup_create ` — exports factory state: fetches all issues (open+closed) from the project and ops repos via Forgejo API, bundles the ops repo as a git bundle, and writes a tarball. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_REPO`, `FORGE_OPS_REPO`, `OPS_REPO_ROOT`. Sourced by `bin/disinto backup create` (#1057). | bin/disinto (backup create) | -| `lib/disinto/backup.sh` | Factory backup restore. `backup_import ` — restores from a backup tarball: creates missing repos via Forgejo API, imports issues (idempotent — skips by number if present), unpacks ops repo git bundle. Idempotent: running twice produces same end state with no errors. Requires `FORGE_URL`, `FORGE_TOKEN`. Sourced by `bin/disinto backup import` (#1058). | bin/disinto (backup import) | +| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (uses `codeberg.org/forgejo/forgejo:11.0` tag; `CLAUDE_BIN_DIR` volume mount removed from agents/llama services — only `reproduce` and `edge` still use the host-mounted CLI (#992); adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility; Forgejo includes a healthcheck so dependent services use `condition: service_healthy` — fixes cold-start races, #665; adds `chat` service block with isolated `chat-config` named volume and `CHAT_HISTORY_DIR` bind-mount for per-user NDJSON history persistence (#710); injects `FORWARD_AUTH_SECRET` for Caddy↔chat defense-in-depth auth (#709); cost-cap env vars `CHAT_MAX_REQUESTS_PER_HOUR`, `CHAT_MAX_REQUESTS_PER_DAY`, `CHAT_MAX_TOKENS_PER_DAY` (#711); subdomain fallback comment for `EDGE_TUNNEL_FQDN_*` vars (#713); all `depends_on` now use `condition: service_healthy/started` instead of bare service names; all services now include `restart: unless-stopped` including the edge service — #768; agents service now uses `image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}` instead of `build:` (#429); `WOODPECKER_PLUGINS_PRIVILEGED` env var added to woodpecker service (#779); agents-llama conditional block gated on `ENABLE_LLAMA_AGENT=1` (#769); `agents-llama-all` compose service (profile `agents-llama-all`, all 7 roles: review,dev,gardener,architect,planner,predictor,supervisor) added by #801; agents service gains volume mounts for `./projects`, `./.env`, `./state`), `generate_caddyfile()` — Caddyfile (routes: `/forge/*` → forgejo:3000, `/woodpecker/*` → woodpecker:8000, `/staging/*` → staging:80; `/chat/login` and `/chat/oauth/callback` bypass `forward_auth` so unauthenticated users can reach the OAuth flow; `/chat/*` gated by `forward_auth` on `chat:8080/chat/auth/verify` which stamps `X-Forwarded-User` (#709); root `/` redirects to `/forge/`), `generate_staging_index()` — staging index, `generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) | | `lib/sprint-filer.sh` | Post-merge sub-issue filer for sprint PRs. Invoked by the `.woodpecker/ops-filer.yml` pipeline after a sprint PR merges to ops repo `main`. Parses ` ... ` blocks from sprint PR bodies to extract sub-issue definitions, creates them on the project repo using `FORGE_FILER_TOKEN` (narrow-scope `filer-bot` identity with `issues:write` only), adds `in-progress` label to the parent vision issue, and handles vision lifecycle closure when all sub-issues are closed. Uses `filer_api_all()` for paginated fetches. Idempotent: uses `` markers to skip already-filed issues. Requires `FORGE_FILER_TOKEN`, `FORGE_API`, `FORGE_API_BASE`, `FORGE_OPS_REPO`. | `.woodpecker/ops-filer.yml` (CI pipeline on ops repo) | | `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) | | `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) | diff --git a/lib/agent-sdk.sh b/lib/agent-sdk.sh index b968222..2522655 100644 --- a/lib/agent-sdk.sh +++ b/lib/agent-sdk.sh @@ -52,9 +52,8 @@ claude_run_with_watchdog() { out_file=$(mktemp) || return 1 trap 'rm -f "$out_file"' RETURN - # Start claude in new process group (setsid creates new session, $pid is PGID leader) - # All children of claude will inherit this process group - setsid "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" & + # Start claude in background, capturing stdout to temp file + "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" & pid=$! # Background watchdog: poll for final result marker @@ -85,12 +84,12 @@ claude_run_with_watchdog() { sleep "$grace" if kill -0 "$pid" 2>/dev/null; then log "watchdog: claude -p idle for ${grace}s after final result; SIGTERM" - kill -TERM -- "-$pid" 2>/dev/null || true + kill -TERM "$pid" 2>/dev/null || true # Give it a moment to clean up sleep 5 if kill -0 "$pid" 2>/dev/null; then log "watchdog: force kill after SIGTERM timeout" - kill -KILL -- "-$pid" 2>/dev/null || true + kill -KILL "$pid" 2>/dev/null || true fi fi fi @@ -101,16 +100,16 @@ claude_run_with_watchdog() { timeout --foreground "${CLAUDE_TIMEOUT:-7200}" tail --pid="$pid" -f /dev/null 2>/dev/null rc=$? - # Clean up the watchdog (target process group if it spawned children) - kill -- "-$grace_pid" 2>/dev/null || true + # Clean up the watchdog + kill "$grace_pid" 2>/dev/null || true wait "$grace_pid" 2>/dev/null || true - # When timeout fires (rc=124), explicitly kill the orphaned claude process group + # When timeout fires (rc=124), explicitly kill the orphaned claude process # tail --pid is a passive waiter, not a supervisor if [ "$rc" -eq 124 ]; then - kill -TERM -- "-$pid" 2>/dev/null || true + kill "$pid" 2>/dev/null || true sleep 1 - kill -KILL -- "-$pid" 2>/dev/null || true + kill -KILL "$pid" 2>/dev/null || true fi # Output the captured stdout diff --git a/lib/backup.sh b/lib/backup.sh deleted file mode 100644 index 8d7a827..0000000 --- a/lib/backup.sh +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# disinto backup — export factory state for migration -# -# Usage: source this file, then call backup_create -# Requires: FORGE_URL, FORGE_TOKEN, FORGE_REPO, FORGE_OPS_REPO, OPS_REPO_ROOT -# ============================================================================= -set -euo pipefail - -# Fetch all issues (open + closed) for a repo slug and emit the normalized JSON array. -# Usage: _backup_fetch_issues -_backup_fetch_issues() { - local repo_slug="$1" - local api_url="${FORGE_API_BASE}/repos/${repo_slug}" - - local all_issues="[]" - for state in open closed; do - local page=1 - while true; do - local page_items - page_items=$(curl -sf -X GET \ - -H "Authorization: token ${FORGE_TOKEN}" \ - -H "Content-Type: application/json" \ - "${api_url}/issues?state=${state}&type=issues&limit=50&page=${page}") || { - echo "ERROR: failed to fetch ${state} issues from ${repo_slug} (page ${page})" >&2 - return 1 - } - local count - count=$(printf '%s' "$page_items" | jq 'length' 2>/dev/null) || count=0 - [ -z "$count" ] && count=0 - [ "$count" -eq 0 ] && break - all_issues=$(printf '%s\n%s' "$all_issues" "$page_items" | jq -s 'add') - [ "$count" -lt 50 ] && break - page=$((page + 1)) - done - done - - # Normalize to the schema: number, title, body, labels, state - printf '%s' "$all_issues" | jq '[.[] | { - number: .number, - title: .title, - body: .body, - labels: [.labels[]?.name], - state: .state - }] | sort_by(.number)' -} - -# Create a backup tarball of factory state. -# Usage: backup_create -backup_create() { - local outfile="${1:-}" - if [ -z "$outfile" ]; then - echo "Error: output file required" >&2 - echo "Usage: disinto backup create " >&2 - return 1 - fi - - # Resolve to absolute path before cd-ing into tmpdir - case "$outfile" in - /*) ;; - *) outfile="$(pwd)/${outfile}" ;; - esac - - # Validate required env - : "${FORGE_URL:?FORGE_URL must be set}" - : "${FORGE_TOKEN:?FORGE_TOKEN must be set}" - : "${FORGE_REPO:?FORGE_REPO must be set}" - - local forge_ops_repo="${FORGE_OPS_REPO:-${FORGE_REPO}-ops}" - local ops_repo_root="${OPS_REPO_ROOT:-}" - - if [ -z "$ops_repo_root" ] || [ ! -d "$ops_repo_root/.git" ]; then - echo "Error: OPS_REPO_ROOT (${ops_repo_root:-}) is not a valid git repo" >&2 - return 1 - fi - - local tmpdir - tmpdir=$(mktemp -d) - trap 'rm -rf "$tmpdir"' EXIT - - local project_name="${FORGE_REPO##*/}" - - echo "=== disinto backup create ===" - echo "Forge: ${FORGE_URL}" - echo "Repos: ${FORGE_REPO}, ${forge_ops_repo}" - - # ── 1. Export issues ────────────────────────────────────────────────────── - mkdir -p "${tmpdir}/issues" - - echo "Fetching issues for ${FORGE_REPO}..." - _backup_fetch_issues "$FORGE_REPO" > "${tmpdir}/issues/${project_name}.json" - local main_count - main_count=$(jq 'length' "${tmpdir}/issues/${project_name}.json") - echo " ${main_count} issues exported" - - echo "Fetching issues for ${forge_ops_repo}..." - _backup_fetch_issues "$forge_ops_repo" > "${tmpdir}/issues/${project_name}-ops.json" - local ops_count - ops_count=$(jq 'length' "${tmpdir}/issues/${project_name}-ops.json") - echo " ${ops_count} issues exported" - - # ── 2. Git bundle of ops repo ──────────────────────────────────────────── - mkdir -p "${tmpdir}/repos" - - echo "Creating git bundle for ${forge_ops_repo}..." - git -C "$ops_repo_root" bundle create "${tmpdir}/repos/${project_name}-ops.bundle" --all 2>&1 - echo " bundle created ($(du -h "${tmpdir}/repos/${project_name}-ops.bundle" | cut -f1))" - - # ── 3. Metadata ────────────────────────────────────────────────────────── - local created_at - created_at=$(date -u +"%Y-%m-%dT%H:%M:%SZ") - - jq -n \ - --arg created_at "$created_at" \ - --arg source_host "$(hostname)" \ - --argjson schema_version 1 \ - --arg forgejo_url "$FORGE_URL" \ - '{ - created_at: $created_at, - source_host: $source_host, - schema_version: $schema_version, - forgejo_url: $forgejo_url - }' > "${tmpdir}/metadata.json" - - # ── 4. Pack tarball ────────────────────────────────────────────────────── - echo "Creating tarball: ${outfile}" - tar -czf "$outfile" -C "$tmpdir" metadata.json issues repos - local size - size=$(du -h "$outfile" | cut -f1) - echo "=== Backup complete: ${outfile} (${size}) ===" - - # Clean up before returning — the EXIT trap references the local $tmpdir - # which goes out of scope after return, causing 'unbound variable' under set -u. - trap - EXIT - rm -rf "$tmpdir" -} diff --git a/lib/ci-helpers.sh b/lib/ci-helpers.sh index 6afe97b..11c668e 100644 --- a/lib/ci-helpers.sh +++ b/lib/ci-helpers.sh @@ -247,31 +247,6 @@ ci_promote() { echo "$new_num" } -# ci_get_step_logs -# Fetches logs for a single CI step via the Woodpecker API. -# Requires: WOODPECKER_REPO_ID, woodpecker_api() (from env.sh) -# Returns: 0 on success, 1 on failure. Outputs log text to stdout. -# -# Usage: -# ci_get_step_logs 1423 5 # Get logs for step ID 5 in pipeline 1423 -ci_get_step_logs() { - local pipeline_num="$1" step_id="$2" - - if [ -z "$pipeline_num" ] || [ -z "$step_id" ]; then - echo "Usage: ci_get_step_logs " >&2 - return 1 - fi - - if [ -z "${WOODPECKER_REPO_ID:-}" ] || [ "${WOODPECKER_REPO_ID}" = "0" ]; then - echo "ERROR: WOODPECKER_REPO_ID not set or zero" >&2 - return 1 - fi - - woodpecker_api "/repos/${WOODPECKER_REPO_ID}/logs/${pipeline_num}/${step_id}" \ - --max-time 15 2>/dev/null \ - | jq -r '.[].data // empty' 2>/dev/null -} - # ci_get_logs [--step ] # Reads CI logs from the Woodpecker SQLite database. # Requires: WOODPECKER_DATA_DIR env var or mounted volume at /woodpecker-data diff --git a/lib/disinto/backup.sh b/lib/disinto/backup.sh deleted file mode 100644 index 2c34bba..0000000 --- a/lib/disinto/backup.sh +++ /dev/null @@ -1,385 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# backup.sh — backup/restore utilities for disinto factory state -# -# Subcommands: -# create Create backup of factory state -# import Restore factory state from backup -# -# Usage: -# source "${FACTORY_ROOT}/lib/disinto/backup.sh" -# backup_import -# -# Environment: -# FORGE_URL - Forgejo instance URL (target) -# FORGE_TOKEN - Admin token for target Forgejo -# -# Idempotency: -# - Repos: created via API if missing -# - Issues: check if exists by number, skip if present -# - Runs twice = same end state, no errors -# ============================================================================= -set -euo pipefail - -# ── Helper: log with timestamp ─────────────────────────────────────────────── -backup_log() { - local msg="$1" - echo "[$(date '+%Y-%m-%d %H:%M:%S')] $msg" -} - -# ── Helper: create repo if it doesn't exist ───────────────────────────────── -# Usage: backup_create_repo_if_missing -# Returns: 0 if repo exists or was created, 1 on error -backup_create_repo_if_missing() { - local slug="$1" - local org_name="${slug%%/*}" - local repo_name="${slug##*/}" - - # Check if repo exists - if curl -sf --max-time 5 \ - -H "Authorization: token ${FORGE_TOKEN}" \ - "${FORGE_URL}/api/v1/repos/${slug}" >/dev/null 2>&1; then - backup_log "Repo ${slug} already exists" - return 0 - fi - - backup_log "Creating repo ${slug}..." - - # Create org if needed - curl -sf -X POST \ - -H "Authorization: token ${FORGE_TOKEN}" \ - -H "Content-Type: application/json" \ - "${FORGE_URL}/api/v1/orgs" \ - -d "{\"username\":\"${org_name}\",\"visibility\":\"public\"}" >/dev/null 2>&1 || true - - # Create repo - local response - response=$(curl -sf -X POST \ - -H "Authorization: token ${FORGE_TOKEN}" \ - -H "Content-Type: application/json" \ - "${FORGE_URL}/api/v1/orgs/${org_name}/repos" \ - -d "{\"name\":\"${repo_name}\",\"auto_init\":false,\"default_branch\":\"main\"}" 2>/dev/null) \ - || response="" - - if [ -n "$response" ] && echo "$response" | grep -q '"id":\|[0-9]'; then - backup_log "Created repo ${slug}" - BACKUP_CREATED_REPOS=$((BACKUP_CREATED_REPOS + 1)) - return 0 - fi - - # Fallback: admin endpoint - response=$(curl -sf -X POST \ - -H "Authorization: token ${FORGE_TOKEN}" \ - -H "Content-Type: application/json" \ - "${FORGE_URL}/api/v1/admin/users/${org_name}/repos" \ - -d "{\"name\":\"${repo_name}\",\"auto_init\":false,\"default_branch\":\"main\"}" 2>/dev/null) \ - || response="" - - if [ -n "$response" ] && echo "$response" | grep -q '"id":\|[0-9]'; then - backup_log "Created repo ${slug} (via admin API)" - BACKUP_CREATED_REPOS=$((BACKUP_CREATED_REPOS + 1)) - return 0 - fi - - backup_log "ERROR: failed to create repo ${slug}" >&2 - return 1 -} - -# ── Helper: check if issue exists by number ────────────────────────────────── -# Usage: backup_issue_exists -# Returns: 0 if exists, 1 if not -backup_issue_exists() { - local slug="$1" - local issue_num="$2" - - curl -sf --max-time 5 \ - -H "Authorization: token ${FORGE_TOKEN}" \ - "${FORGE_URL}/api/v1/repos/${slug}/issues/${issue_num}" >/dev/null 2>&1 -} - -# ── Helper: create issue with specific number (if Forgejo supports it) ─────── -# Note: Forgejo API auto-assigns next integer; we accept renumbering and log mapping -# Usage: backup_create_issue <body> [labels...] -# Returns: new_issue_number on success, 0 on failure -backup_create_issue() { - local slug="$1" - local original_num="$2" - local title="$3" - local body="$4" - shift 4 - - # Build labels array - local -a labels=() - for label in "$@"; do - # Resolve label name to ID - local label_id - label_id=$(curl -sf --max-time 5 \ - -H "Authorization: token ${FORGE_TOKEN}" \ - "${FORGE_URL}/api/v1/repos/${slug}/labels" 2>/dev/null \ - | jq -r ".[] | select(.name == \"${label}\") | .id" 2>/dev/null) || label_id="" - - if [ -n "$label_id" ] && [ "$label_id" != "null" ]; then - labels+=("$label_id") - fi - done - - # Build payload - local payload - if [ ${#labels[@]} -gt 0 ]; then - payload=$(jq -n \ - --arg title "$title" \ - --arg body "$body" \ - --argjson labels "$(printf '%s\n' "${labels[@]}" | jq -R . | jq -s .)" \ - '{title: $title, body: $body, labels: $labels}') - else - payload=$(jq -n --arg title "$title" --arg body "$body" '{title: $title, body: $body, labels: []}') - fi - - local response - response=$(curl -sf -X POST \ - -H "Authorization: token ${FORGE_TOKEN}" \ - -H "Content-Type: application/json" \ - "${FORGE_URL}/api/v1/repos/${slug}/issues" \ - -d "$payload" 2>/dev/null) || { - backup_log "ERROR: failed to create issue '${title}'" >&2 - return 1 - } - - local new_num - new_num=$(printf '%s' "$response" | jq -r '.number // empty') - - # Log the mapping - echo "${original_num}:${new_num}" >> "${BACKUP_MAPPING_FILE}" - - backup_log "Created issue '${title}' as #${new_num} (original: #${original_num})" - echo "$new_num" -} - -# ── Step 1: Unpack tarball to temp dir ─────────────────────────────────────── -# Usage: backup_unpack_tarball <tarball> -# Returns: temp dir path via BACKUP_TEMP_DIR -backup_unpack_tarball() { - local tarball="$1" - - if [ ! -f "$tarball" ]; then - backup_log "ERROR: tarball not found: ${tarball}" >&2 - return 1 - fi - - BACKUP_TEMP_DIR=$(mktemp -d -t disinto-backup.XXXXXX) - backup_log "Unpacking ${tarball} to ${BACKUP_TEMP_DIR}" - - if ! tar -xzf "$tarball" -C "$BACKUP_TEMP_DIR"; then - backup_log "ERROR: failed to unpack tarball" >&2 - rm -rf "$BACKUP_TEMP_DIR" - return 1 - fi - - # Verify expected structure - if [ ! -d "${BACKUP_TEMP_DIR}/repos" ]; then - backup_log "ERROR: tarball missing 'repos/' directory" >&2 - rm -rf "$BACKUP_TEMP_DIR" - return 1 - fi - - backup_log "Tarball unpacked successfully" -} - -# ── Step 2: disinto repo — create via Forgejo API, trigger sync (manual) ───── -# Usage: backup_import_disinto_repo -# Returns: 0 on success, 1 on failure -backup_import_disinto_repo() { - backup_log "Step 2: Configuring disinto repo..." - - # Create disinto repo if missing - backup_create_repo_if_missing "disinto-admin/disinto" - - # Note: Manual mirror configuration recommended (avoids SSH deploy-key handling) - backup_log "Note: Configure Codeberg → Forgejo pull mirror manually" - backup_log " Run on Forgejo admin panel: Repository Settings → Repository Mirroring" - backup_log " Source: ssh://git@codeberg.org/johba/disinto.git" - backup_log " Mirror: disinto-admin/disinto" - backup_log " Or use: git clone --mirror ssh://git@codeberg.org/johba/disinto.git" - backup_log " cd disinto.git && git push --mirror ${FORGE_URL}/disinto-admin/disinto.git" - - return 0 -} - -# ── Step 3: disinto-ops repo — create empty, push from bundle ──────────────── -# Usage: backup_import_disinto_ops_repo -# Returns: 0 on success, 1 on failure -backup_import_disinto_ops_repo() { - backup_log "Step 3: Configuring disinto-ops repo from bundle..." - - local bundle_path="${BACKUP_TEMP_DIR}/repos/disinto-ops.bundle" - - if [ ! -f "$bundle_path" ]; then - backup_log "WARNING: Bundle not found at ${bundle_path}, skipping" - return 0 - fi - - # Create ops repo if missing - backup_create_repo_if_missing "disinto-admin/disinto-ops" - - # Clone bundle and push to Forgejo - local clone_dir - clone_dir=$(mktemp -d -t disinto-ops-clone.XXXXXX) - backup_log "Cloning bundle to ${clone_dir}" - - if ! git clone --bare "$bundle_path" "$clone_dir/disinto-ops.git"; then - backup_log "ERROR: failed to clone bundle" - rm -rf "$clone_dir" - return 1 - fi - - # Push all refs to Forgejo - backup_log "Pushing refs to Forgejo..." - if ! cd "$clone_dir/disinto-ops.git" && \ - git push --mirror "${FORGE_URL}/disinto-admin/disinto-ops.git" 2>&1; then - backup_log "ERROR: failed to push refs" - rm -rf "$clone_dir" - return 1 - fi - - local ref_count - ref_count=$(cd "$clone_dir/disinto-ops.git" && git show-ref | wc -l) - BACKUP_PUSHED_REFS=$((BACKUP_PUSHED_REFS + ref_count)) - - backup_log "Pushed ${ref_count} refs to disinto-ops" - rm -rf "$clone_dir" - - return 0 -} - -# ── Step 4: Import issues from backup ──────────────────────────────────────── -# Usage: backup_import_issues <slug> <issues_dir> -# Returns: 0 on success -backup_import_issues() { - local slug="$1" - local issues_dir="$2" - - if [ ! -d "$issues_dir" ]; then - backup_log "No issues directory found, skipping" - return 0 - fi - - local created=0 - local skipped=0 - - for issue_file in "${issues_dir}"/*.json; do - [ -f "$issue_file" ] || continue - - backup_log "Processing issue file: $(basename "$issue_file")" - - local issue_num title body - issue_num=$(jq -r '.number // empty' "$issue_file") - title=$(jq -r '.title // empty' "$issue_file") - body=$(jq -r '.body // empty' "$issue_file") - - if [ -z "$issue_num" ] || [ "$issue_num" = "null" ]; then - backup_log "WARNING: skipping issue without number: $(basename "$issue_file")" - continue - fi - - # Check if issue already exists - if backup_issue_exists "$slug" "$issue_num"; then - backup_log "Issue #${issue_num} already exists, skipping" - skipped=$((skipped + 1)) - continue - fi - - # Extract labels - local -a labels=() - while IFS= read -r label; do - [ -n "$label" ] && labels+=("$label") - done < <(jq -r '.labels[]? // empty' "$issue_file") - - # Create issue - local new_num - if new_num=$(backup_create_issue "$slug" "$issue_num" "$title" "$body" "${labels[@]}"); then - created=$((created + 1)) - fi - done - - BACKUP_CREATED_ISSUES=$((BACKUP_CREATED_ISSUES + created)) - BACKUP_SKIPPED_ISSUES=$((BACKUP_SKIPPED_ISSUES + skipped)) - - backup_log "Created ${created} issues, skipped ${skipped}" -} - -# ── Main: import subcommand ────────────────────────────────────────────────── -# Usage: backup_import <tarball> -backup_import() { - local tarball="$1" - - # Validate required environment - [ -n "${FORGE_URL:-}" ] || { echo "Error: FORGE_URL not set" >&2; exit 1; } - [ -n "${FORGE_TOKEN:-}" ] || { echo "Error: FORGE_TOKEN not set" >&2; exit 1; } - - backup_log "=== Backup Import Started ===" - backup_log "Target: ${FORGE_URL}" - backup_log "Tarball: ${tarball}" - - # Initialize counters - BACKUP_CREATED_REPOS=0 - BACKUP_PUSHED_REFS=0 - BACKUP_CREATED_ISSUES=0 - BACKUP_SKIPPED_ISSUES=0 - - # Create temp dir for mapping file - BACKUP_MAPPING_FILE=$(mktemp -t disinto-mapping.XXXXXX.json) - echo '{"mappings": []}' > "$BACKUP_MAPPING_FILE" - - # Step 1: Unpack tarball - if ! backup_unpack_tarball "$tarball"; then - exit 1 - fi - - # Step 2: disinto repo - if ! backup_import_disinto_repo; then - exit 1 - fi - - # Step 3: disinto-ops repo - if ! backup_import_disinto_ops_repo; then - exit 1 - fi - - # Step 4: Import issues for each repo with issues/*.json - for repo_dir in "${BACKUP_TEMP_DIR}/repos"/*/; do - [ -d "$repo_dir" ] || continue - - local slug - slug=$(basename "$repo_dir") - - backup_log "Processing repo: ${slug}" - - local issues_dir="${repo_dir}issues" - if [ -d "$issues_dir" ]; then - backup_import_issues "$slug" "$issues_dir" - fi - done - - # Summary - backup_log "=== Backup Import Complete ===" - backup_log "Created ${BACKUP_CREATED_REPOS} repos" - backup_log "Pushed ${BACKUP_PUSHED_REFS} refs" - backup_log "Imported ${BACKUP_CREATED_ISSUES} issues" - backup_log "Skipped ${BACKUP_SKIPPED_ISSUES} (already present)" - backup_log "Issue mapping saved to: ${BACKUP_MAPPING_FILE}" - - # Cleanup - rm -rf "$BACKUP_TEMP_DIR" - - exit 0 -} - -# ── Entry point: if sourced, don't run; if executed directly, run import ──── -if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then - if [ $# -lt 1 ]; then - echo "Usage: $0 <tarball>" >&2 - exit 1 - fi - - backup_import "$1" -fi diff --git a/lib/generators.sh b/lib/generators.sh index eb223e8..3053dfc 100644 --- a/lib/generators.sh +++ b/lib/generators.sh @@ -313,10 +313,6 @@ _generate_compose_impl() { return 0 fi - # Reset duplicate detection state for fresh run - _seen_services=() - _service_sources=() - # Initialize duplicate detection with base services defined in the template _record_service "forgejo" "base compose template" || return 1 _record_service "woodpecker" "base compose template" || return 1 @@ -405,9 +401,6 @@ services: WOODPECKER_SERVER: localhost:9000 WOODPECKER_AGENT_SECRET: ${WOODPECKER_AGENT_SECRET:-} WOODPECKER_GRPC_SECURE: "false" - WOODPECKER_GRPC_KEEPALIVE_TIME: "10s" - WOODPECKER_GRPC_KEEPALIVE_TIMEOUT: "20s" - WOODPECKER_GRPC_KEEPALIVE_PERMIT_WITHOUT_CALLS: "true" WOODPECKER_HEALTHCHECK_ADDR: ":3333" WOODPECKER_BACKEND_DOCKER_NETWORK: ${WOODPECKER_CI_NETWORK:-disinto_disinto-net} WOODPECKER_MAX_WORKFLOWS: 1 diff --git a/lib/issue-lifecycle.sh b/lib/issue-lifecycle.sh index 25f2c6b..1ad3239 100644 --- a/lib/issue-lifecycle.sh +++ b/lib/issue-lifecycle.sh @@ -157,10 +157,9 @@ issue_claim() { return 1 fi - local ip_id bl_id bk_id + local ip_id bl_id ip_id=$(_ilc_in_progress_id) bl_id=$(_ilc_backlog_id) - bk_id=$(_ilc_blocked_id) if [ -n "$ip_id" ]; then curl -sf -X POST \ -H "Authorization: token ${FORGE_TOKEN}" \ @@ -173,12 +172,6 @@ issue_claim() { -H "Authorization: token ${FORGE_TOKEN}" \ "${FORGE_API}/issues/${issue}/labels/${bl_id}" >/dev/null 2>&1 || true fi - # Clear blocked label on re-claim — starting work is implicit resolution of prior block - if [ -n "$bk_id" ]; then - curl -sf -X DELETE \ - -H "Authorization: token ${FORGE_TOKEN}" \ - "${FORGE_API}/issues/${issue}/labels/${bk_id}" >/dev/null 2>&1 || true - fi _ilc_log "claimed issue #${issue}" return 0 } diff --git a/lib/pr-lifecycle.sh b/lib/pr-lifecycle.sh index bca08f1..e097f34 100644 --- a/lib/pr-lifecycle.sh +++ b/lib/pr-lifecycle.sh @@ -429,100 +429,19 @@ pr_walk_to_merge() { _prl_log "CI failed — invoking agent (attempt ${ci_fix_count}/${max_ci_fixes})" - # Build per-workflow/per-step CI diagnostics prompt - local ci_prompt_body="" - local passing_workflows="" - local built_diagnostics=false - - if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${WOODPECKER_REPO_ID:-}" ]; then - local pip_json - pip_json=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_PR_CI_PIPELINE}" 2>/dev/null) || pip_json="" - - if [ -n "$pip_json" ]; then - local wf_count - wf_count=$(printf '%s' "$pip_json" | jq '[.workflows[]?] | length' 2>/dev/null) || wf_count=0 - - if [ "$wf_count" -gt 0 ]; then - built_diagnostics=true - local wf_idx=0 - while [ "$wf_idx" -lt "$wf_count" ]; do - local wf_name wf_state - wf_name=$(printf '%s' "$pip_json" | jq -r ".workflows[$wf_idx].name // \"workflow-$wf_idx\"" 2>/dev/null) - wf_state=$(printf '%s' "$pip_json" | jq -r ".workflows[$wf_idx].state // \"unknown\"" 2>/dev/null) - - if [ "$wf_state" = "failure" ] || [ "$wf_state" = "error" ] || [ "$wf_state" = "killed" ]; then - # Collect failed children for this workflow - local failed_children - failed_children=$(printf '%s' "$pip_json" | jq -r " - .workflows[$wf_idx].children[]? | - select(.state == \"failure\" or .state == \"error\" or .state == \"killed\") | - \"\(.name)\t\(.exit_code)\t\(.pid)\"" 2>/dev/null) || failed_children="" - - ci_prompt_body="${ci_prompt_body} ---- Failed workflow: ${wf_name} ---" - if [ -n "$failed_children" ]; then - while IFS=$'\t' read -r step_name step_exit step_pid; do - [ -z "$step_name" ] && continue - local exit_annotation="" - case "$step_exit" in - 126) exit_annotation=" (permission denied or not executable)" ;; - 127) exit_annotation=" (command not found)" ;; - 128) exit_annotation=" (invalid exit argument / signal+128)" ;; - esac - ci_prompt_body="${ci_prompt_body} - Step: ${step_name} - Exit code: ${step_exit}${exit_annotation}" - - # Fetch per-step logs - if [ -n "$step_pid" ] && [ "$step_pid" != "null" ]; then - local step_logs - step_logs=$(ci_get_step_logs "$_PR_CI_PIPELINE" "$step_pid" 2>/dev/null | tail -50) || step_logs="" - if [ -n "$step_logs" ]; then - ci_prompt_body="${ci_prompt_body} - Log tail (last 50 lines): -\`\`\` -${step_logs} -\`\`\`" - fi - fi - done <<< "$failed_children" - else - ci_prompt_body="${ci_prompt_body} - (no failed step details available)" - fi - else - # Track passing/other workflows - if [ -n "$passing_workflows" ]; then - passing_workflows="${passing_workflows}, ${wf_name}" - else - passing_workflows="${wf_name}" - fi - fi - wf_idx=$((wf_idx + 1)) - done - fi - fi + # Get CI logs from SQLite database if available + local ci_logs="" + if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${FACTORY_ROOT:-}" ]; then + ci_logs=$(ci_get_logs "$_PR_CI_PIPELINE" 2>/dev/null | tail -50) || ci_logs="" fi - # Fallback: use legacy log fetch if per-workflow diagnostics unavailable - if [ "$built_diagnostics" = false ]; then - local ci_logs="" - if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${FACTORY_ROOT:-}" ]; then - ci_logs=$(ci_get_logs "$_PR_CI_PIPELINE" 2>/dev/null | tail -50) || ci_logs="" - fi - if [ -n "$ci_logs" ]; then - ci_prompt_body=" + local logs_section="" + if [ -n "$ci_logs" ]; then + logs_section=" CI Log Output (last 50 lines): \`\`\` ${ci_logs} -\`\`\`" - fi - fi - - local passing_line="" - if [ -n "$passing_workflows" ]; then - passing_line=" -Passing workflows (do not modify): ${passing_workflows} +\`\`\` " fi @@ -531,10 +450,9 @@ Passing workflows (do not modify): ${passing_workflows} Pipeline: #${_PR_CI_PIPELINE:-?} Failure type: ${_PR_CI_FAILURE_TYPE:-unknown} -${passing_line} + Error log: -${_PR_CI_ERROR_LOG:-No logs available.} -${ci_prompt_body} +${_PR_CI_ERROR_LOG:-No logs available.}${logs_section} Fix the issue, run tests, commit, rebase on ${PRIMARY_BRANCH}, and push: git fetch ${remote} ${PRIMARY_BRANCH} && git rebase ${remote}/${PRIMARY_BRANCH} diff --git a/nomad/AGENTS.md b/nomad/AGENTS.md index f5f2f7a..bf62f45 100644 --- a/nomad/AGENTS.md +++ b/nomad/AGENTS.md @@ -1,4 +1,4 @@ -<!-- last-reviewed: 343b928a264e667ae7614be2a72e00555c87c63e --> +<!-- last-reviewed: 0bb04545d47fb43b2cab0a1f4406c2a2b57f4eba --> # nomad/ — Agent Instructions Nomad + Vault HCL for the factory's single-node cluster. These files are diff --git a/nomad/jobs/woodpecker-agent.hcl b/nomad/jobs/woodpecker-agent.hcl index a4111fe..c7779a2 100644 --- a/nomad/jobs/woodpecker-agent.hcl +++ b/nomad/jobs/woodpecker-agent.hcl @@ -57,7 +57,7 @@ job "woodpecker-agent" { check { type = "http" path = "/healthz" - interval = "10s" + interval = "15s" timeout = "3s" } } @@ -89,13 +89,10 @@ job "woodpecker-agent" { # Nomad's port stanza to the allocation's IP (not localhost), so the # agent must use the LXC's eth0 IP, not 127.0.0.1. env { - WOODPECKER_SERVER = "${attr.unique.network.ip-address}:9000" - WOODPECKER_GRPC_SECURE = "false" - WOODPECKER_GRPC_KEEPALIVE_TIME = "10s" - WOODPECKER_GRPC_KEEPALIVE_TIMEOUT = "20s" - WOODPECKER_GRPC_KEEPALIVE_PERMIT_WITHOUT_CALLS = "true" - WOODPECKER_MAX_WORKFLOWS = "1" - WOODPECKER_HEALTHCHECK_ADDR = ":3333" + WOODPECKER_SERVER = "${attr.unique.network.ip-address}:9000" + WOODPECKER_GRPC_SECURE = "false" + WOODPECKER_MAX_WORKFLOWS = "1" + WOODPECKER_HEALTHCHECK_ADDR = ":3333" } # ── Vault-templated agent secret ────────────────────────────────── diff --git a/planner/AGENTS.md b/planner/AGENTS.md index a2c677c..911ff21 100644 --- a/planner/AGENTS.md +++ b/planner/AGENTS.md @@ -1,4 +1,4 @@ -<!-- last-reviewed: 343b928a264e667ae7614be2a72e00555c87c63e --> +<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea --> # Planner Agent **Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints), diff --git a/predictor/AGENTS.md b/predictor/AGENTS.md index ed7f24b..a263066 100644 --- a/predictor/AGENTS.md +++ b/predictor/AGENTS.md @@ -1,4 +1,4 @@ -<!-- last-reviewed: 343b928a264e667ae7614be2a72e00555c87c63e --> +<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea --> # Predictor Agent **Role**: Abstract adversary (the "goblin"). Runs a 2-step formula diff --git a/review/AGENTS.md b/review/AGENTS.md index 6590259..24606d1 100644 --- a/review/AGENTS.md +++ b/review/AGENTS.md @@ -1,4 +1,4 @@ -<!-- last-reviewed: 343b928a264e667ae7614be2a72e00555c87c63e --> +<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea --> # Review Agent **Role**: AI-powered PR review — post structured findings and formal diff --git a/review/review-pr.sh b/review/review-pr.sh index 09f6cb6..091025f 100755 --- a/review/review-pr.sh +++ b/review/review-pr.sh @@ -52,35 +52,8 @@ REVIEW_TMPDIR=$(mktemp -d) log() { printf '[%s] PR#%s %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" >> "$LOGFILE"; } status() { printf '[%s] PR #%s: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" > "$STATUSFILE"; log "$*"; } - -# cleanup — remove temp files (NOT lockfile — cleanup_on_exit handles that) -cleanup() { - rm -rf "$REVIEW_TMPDIR" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json" -} - -# cleanup_on_exit — defensive cleanup: remove lockfile if we own it, kill residual children -# This handles the case where review-pr.sh is terminated unexpectedly (e.g., watchdog SIGTERM) -cleanup_on_exit() { - local ec=$? - # Remove lockfile only if we own it (PID matches $$) - if [ -f "$LOCKFILE" ] && [ -n "$(cat "$LOCKFILE" 2>/dev/null)" ]; then - if [ "$(cat "$LOCKFILE" 2>/dev/null)" = "$$" ]; then - rm -f "$LOCKFILE" - log "cleanup_on_exit: removed lockfile (we owned it)" - fi - fi - # Kill any direct children that may have been spawned by this process - # (e.g., bash -c commands from Claude's Bash tool that didn't get reaped) - pkill -P $$ 2>/dev/null || true - # Call the main cleanup function to remove temp files - cleanup - exit "$ec" -} -trap cleanup_on_exit EXIT INT TERM - -# Note: EXIT trap is already set above. The cleanup function is still available for -# non-error exits (e.g., normal completion via exit 0 after verdict posted). -# When review succeeds, we want to skip lockfile removal since the verdict was posted. +cleanup() { rm -rf "$REVIEW_TMPDIR" "$LOCKFILE" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json"; } +trap cleanup EXIT # ============================================================================= # LOG ROTATION @@ -131,7 +104,6 @@ if [ "$PR_STATE" != "open" ]; then log "SKIP: state=${PR_STATE}" worktree_cleanup "$WORKTREE" rm -f "$OUTPUT_FILE" "$SID_FILE" 2>/dev/null || true - rm -f "$LOCKFILE" exit 0 fi @@ -141,7 +113,7 @@ fi CI_STATE=$(ci_commit_status "$PR_SHA") CI_NOTE="" if ! ci_passed "$CI_STATE"; then - ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; rm -f "$LOCKFILE"; exit 0; } + ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; exit 0; } CI_NOTE=" (not required — non-code PR)" fi @@ -151,10 +123,10 @@ fi ALL_COMMENTS=$(forge_api_all "/issues/${PR_NUMBER}/comments") HAS_CMT=$(printf '%s' "$ALL_COMMENTS" | jq --arg s "$PR_SHA" \ '[.[]|select(.body|contains("<!-- reviewed: "+$s+" -->"))]|length') -[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; rm -f "$LOCKFILE"; exit 0; } +[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; exit 0; } HAS_FML=$(forge_api_all "/pulls/${PR_NUMBER}/reviews" | jq --arg s "$PR_SHA" \ '[.[]|select(.commit_id==$s)|select(.state!="COMMENT")]|length') -[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; rm -f "$LOCKFILE"; exit 0; } +[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; exit 0; } # ============================================================================= # RE-REVIEW DETECTION @@ -352,7 +324,3 @@ esac profile_write_journal "review-${PR_NUMBER}" "Review PR #${PR_NUMBER} (${VERDICT})" "${VERDICT,,}" "" || true log "DONE: ${VERDICT} (re-review: ${IS_RE_REVIEW})" - -# Remove lockfile on successful completion (cleanup_on_exit will also do this, -# but we do it here to avoid the trap running twice) -rm -f "$LOCKFILE" diff --git a/supervisor/AGENTS.md b/supervisor/AGENTS.md index 2027e44..23a3832 100644 --- a/supervisor/AGENTS.md +++ b/supervisor/AGENTS.md @@ -1,4 +1,4 @@ -<!-- last-reviewed: 343b928a264e667ae7614be2a72e00555c87c63e --> +<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea --> # Supervisor Agent **Role**: Health monitoring and auto-remediation, executed as a formula-driven diff --git a/tests/smoke-init.sh b/tests/smoke-init.sh index 8cd4fee..8f2f44f 100644 --- a/tests/smoke-init.sh +++ b/tests/smoke-init.sh @@ -15,7 +15,6 @@ set -euo pipefail FACTORY_ROOT="$(cd "$(dirname "$0")/.." && pwd)" -export FACTORY_ROOT_REAL="$FACTORY_ROOT" # Always use localhost for mock Forgejo (in case FORGE_URL is set from docker-compose) export FORGE_URL="http://localhost:3000" MOCK_BIN="/tmp/smoke-mock-bin" @@ -428,12 +427,14 @@ rm -rf /tmp/smoke-claude-shared /tmp/smoke-home-claude # ── 8. Test duplicate service name detection ────────────────────────────── echo "=== 8/8 Testing duplicate service name detection ===" -# Isolated factory root — do NOT touch the real ${FACTORY_ROOT}/projects/ -SMOKE_DUP_ROOT=$(mktemp -d) -mkdir -p "$SMOKE_DUP_ROOT/projects" -cat > "$SMOKE_DUP_ROOT/projects/duplicate-test.toml" <<'TOMLEOF' +# Clean up for duplicate test +rm -f "${FACTORY_ROOT}/projects/duplicate-test.toml" +rm -f "${FACTORY_ROOT}/docker-compose.yml" + +# Create a TOML that would conflict with ENABLE_LLAMA_AGENT +cat > "${FACTORY_ROOT}/projects/duplicate-test.toml" <<'TOMLEOF' name = "duplicate-test" -description = "dup-detection smoke" +description = "Test project for duplicate service detection" [ci] woodpecker_repo_id = "999" @@ -445,29 +446,26 @@ roles = ["dev"] forge_user = "llama-bot" TOMLEOF -# Call the generator directly — no `disinto init` to overwrite the TOML. -# FACTORY_ROOT tells generators.sh where projects/ + compose_file live. -( - export FACTORY_ROOT="$SMOKE_DUP_ROOT" - export ENABLE_LLAMA_AGENT=1 - # shellcheck disable=SC1091 - source "${FACTORY_ROOT_REAL:-$(cd "$(dirname "$0")/.." && pwd)}/lib/generators.sh" - # Use a temp file to capture output since pipefail will kill the pipeline - # when _generate_compose_impl returns non-zero - _generate_compose_impl > /tmp/smoke-dup-output.txt 2>&1 || true - if grep -q "Duplicate service name" /tmp/smoke-dup-output.txt; then - pass "Duplicate service detection: conflict between ENABLE_LLAMA_AGENT and [agents.llama] reported" - rm -f /tmp/smoke-dup-output.txt - exit 0 - else - fail "Duplicate service detection: no error raised for ENABLE_LLAMA_AGENT + [agents.llama]" - cat /tmp/smoke-dup-output.txt >&2 - rm -f /tmp/smoke-dup-output.txt - exit 1 - fi -) || FAILED=1 +# Run disinto init with ENABLE_LLAMA_AGENT=1 +# This should fail because [agents.llama] conflicts with ENABLE_LLAMA_AGENT +export ENABLE_LLAMA_AGENT="1" +export FORGE_URL="http://localhost:3000" +export SMOKE_FORGE_URL="$FORGE_URL" +export FORGE_ADMIN_PASS="smoke-test-password-123" +export SKIP_PUSH=true -rm -rf "$SMOKE_DUP_ROOT" +if bash "${FACTORY_ROOT}/bin/disinto" init \ + "duplicate-test" \ + --bare --yes \ + --forge-url "$FORGE_URL" \ + --repo-root "/tmp/smoke-test-repo" 2>&1 | grep -q "Duplicate service name 'agents-llama'"; then + pass "Duplicate service detection: correctly detected conflict between ENABLE_LLAMA_AGENT and [agents.llama]" +else + fail "Duplicate service detection: should have detected conflict between ENABLE_LLAMA_AGENT and [agents.llama]" +fi + +# Clean up +rm -f "${FACTORY_ROOT}/projects/duplicate-test.toml" unset ENABLE_LLAMA_AGENT # ── Summary ────────────────────────────────────────────────────────────────── diff --git a/tests/test-watchdog-process-group.sh b/tests/test-watchdog-process-group.sh deleted file mode 100755 index 54fedf9..0000000 --- a/tests/test-watchdog-process-group.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env bash -# test-watchdog-process-group.sh — Test that claude_run_with_watchdog kills orphan children -# -# This test verifies that when claude_run_with_watchdog terminates the Claude process, -# all child processes (including those spawned by Claude's Bash tool) are also killed. -# -# Reproducer scenario: -# 1. Create a fake "claude" stub that: -# a. Spawns a long-running child process (sleep 3600) -# b. Writes a result marker to stdout to trigger idle detection -# c. Stays running -# 2. Run claude_run_with_watchdog with the stub -# 3. Before the fix: sleep child survives (orphaned to PID 1) -# 4. After the fix: sleep child dies (killed as part of process group with -PID) -# -# Usage: ./tests/test-watchdog-process-group.sh - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)" -TEST_TMP="/tmp/test-watchdog-$$" -LOGFILE="${TEST_TMP}/log.txt" -PASS=true - -# shellcheck disable=SC2317 -cleanup_test() { - rm -rf "$TEST_TMP" -} -trap cleanup_test EXIT INT TERM - -mkdir -p "$TEST_TMP" - -log() { - printf '[TEST] %s\n' "$*" | tee -a "$LOGFILE" -} - -fail() { - printf '[TEST] FAIL: %s\n' "$*" | tee -a "$LOGFILE" - PASS=false -} - -pass() { - printf '[TEST] PASS: %s\n' "$*" | tee -a "$LOGFILE" -} - -# Export required environment variables -export CLAUDE_TIMEOUT=10 # Short timeout for testing -export CLAUDE_IDLE_GRACE=2 # Short grace period for testing -export LOGFILE="${LOGFILE}" # Required by agent-sdk.sh - -# Create a fake claude stub that: -# 1. Spawns a long-running child process (sleep 3600) that will become an orphan if parent is killed -# 2. Writes a result marker to stdout (to trigger the watchdog's idle-after-result path) -# 3. Stays running so the watchdog can kill it -cat > "${TEST_TMP}/fake-claude" << 'FAKE_CLAUDE_EOF' -#!/usr/bin/env bash -# Fake claude that spawns a child and stays running -# Simulates Claude's behavior when it spawns a Bash tool command - -# Write result marker to stdout (triggers watchdog idle detection) -echo '{"type":"result","session_id":"test-session-123","verdict":"APPROVE"}' - -# Spawn a child that simulates Claude's Bash tool hanging -# This is the process that should be killed when the parent is terminated -sleep 3600 & -CHILD_PID=$! - -# Log the child PID for debugging -echo "FAKE_CLAUDE_CHILD_PID=$CHILD_PID" >&2 - -# Stay running - sleep in a loop so the watchdog can kill us -while true; do - sleep 3600 & - wait $! 2>/dev/null || true -done -FAKE_CLAUDE_EOF -chmod +x "${TEST_TMP}/fake-claude" - -log "Testing claude_run_with_watchdog process group cleanup..." - -# Source the library and run claude_run_with_watchdog -cd "$SCRIPT_DIR" -source lib/agent-sdk.sh - -log "Starting claude_run_with_watchdog with fake claude..." - -# Run the function directly (not as a script) -# We need to capture output and redirect stderr -OUTPUT_FILE="${TEST_TMP}/output.txt" -timeout 35 bash -c " - source '${SCRIPT_DIR}/lib/agent-sdk.sh' - CLAUDE_TIMEOUT=10 CLAUDE_IDLE_GRACE=2 LOGFILE='${LOGFILE}' claude_run_with_watchdog '${TEST_TMP}/fake-claude' > '${OUTPUT_FILE}' 2>&1 - exit \$? -" || true - -# Give the watchdog a moment to clean up -log "Waiting for cleanup..." -sleep 5 - -# More precise check: look for sleep 3600 processes -# These would be the orphans from our fake claude -ORPHAN_COUNT=$(pgrep -a sleep 2>/dev/null | grep -c "sleep 3600" 2>/dev/null || echo "0") - -if [ "$ORPHAN_COUNT" -gt 0 ]; then - log "Found $ORPHAN_COUNT orphan sleep 3600 processes:" - pgrep -a sleep | grep "sleep 3600" - fail "Orphan children found - process group cleanup did not work" -else - pass "No orphan children found - process group cleanup worked" -fi - -# Also verify that the fake claude itself is not running -FAKE_CLAUDE_COUNT=$(pgrep -c -f "fake-claude" 2>/dev/null || echo "0") -if [ "$FAKE_CLAUDE_COUNT" -gt 0 ]; then - log "Found $FAKE_CLAUDE_COUNT fake-claude processes still running" - fail "Fake claude process(es) still running" -else - pass "Fake claude process terminated" -fi - -# Summary -echo "" -if [ "$PASS" = true ]; then - log "All tests passed!" - exit 0 -else - log "Some tests failed. See log at $LOGFILE" - exit 1 -fi diff --git a/vault/policies/AGENTS.md b/vault/policies/AGENTS.md index 3127822..9a4b588 100644 --- a/vault/policies/AGENTS.md +++ b/vault/policies/AGENTS.md @@ -1,4 +1,4 @@ -<!-- last-reviewed: 343b928a264e667ae7614be2a72e00555c87c63e --> +<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea --> # vault/policies/ — Agent Instructions HashiCorp Vault ACL policies for the disinto factory. One `.hcl` file per