diff --git a/AGENTS.md b/AGENTS.md index 09879cf..2416d0a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -13,16 +13,14 @@ See `README.md` for the full architecture and `BOOTSTRAP.md` for setup. ## Directory layout ``` -disinto/ +disinto/ (code repo) ├── dev/ dev-poll.sh, dev-agent.sh, phase-handler.sh — issue implementation ├── review/ review-poll.sh, review-pr.sh — PR review ├── gardener/ gardener-run.sh — direct cron executor for run-gardener formula ├── predictor/ predictor-run.sh — daily cron executor for run-predictor formula ├── planner/ planner-run.sh — direct cron executor for run-planner formula -│ planner/journal/ — daily raw logs from each planner run ├── supervisor/ supervisor-run.sh — formula-driven health monitoring (cron wrapper) │ preflight.sh — pre-flight data collection for supervisor formula -│ supervisor/journal/ — daily health logs from each run │ supervisor-poll.sh — legacy bash orchestrator (superseded) ├── vault/ vault-poll.sh, vault-agent.sh, vault-fire.sh — action gating + procurement ├── action/ action-poll.sh, action-agent.sh — operational task execution @@ -30,6 +28,21 @@ disinto/ ├── projects/ *.toml.example — templates; *.toml — local per-box config (gitignored) ├── formulas/ Issue templates (TOML specs for multi-step agent tasks) └── docs/ Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md) + +disinto-ops/ (ops repo — {project}-ops) +├── vault/ +│ ├── pending/ vault items awaiting approval +│ ├── approved/ approved vault items +│ ├── fired/ executed vault items +│ └── rejected/ rejected vault items +├── journal/ +│ ├── planner/ daily planning logs +│ └── supervisor/ operational health logs +├── knowledge/ shared agent knowledge + best practices +├── evidence/ engagement data, experiment results +├── portfolio.md addressables + observables +├── prerequisites.md dependency graph +└── RESOURCES.md accounts, tokens (refs), infra inventory ``` > **Terminology note:** "Formulas" in this repo are TOML issue templates in `formulas/` that 
diff --git a/RESOURCES.md b/RESOURCES.md deleted file mode 100644 index 35138f4..0000000 --- a/RESOURCES.md +++ /dev/null @@ -1,37 +0,0 @@ -# RESOURCES.md — Factory Capability Inventory - -## harb-staging -- type: compute -- capability: run disinto agents, serve website, CI server -- agents: dev, review, action, gardener, supervisor, planner, predictor -- ram: 8GB -- note: disinto-only — no other project agents on this box - -## codeberg-johba -- type: source-control -- capability: host repos, issue tracker, PR workflow, API access -- repos: johba/disinto -- note: owner account - -## codeberg-disinto-bot -- type: source-control -- capability: review PRs, merge PRs, push branches -- repos: johba/disinto -- note: bot account, push+pull permissions, no admin - -## woodpecker-ci -- type: ci -- capability: run pipelines on PR and push events, docker backend -- url: ci.niovi.voyage -- note: self-hosted on harb-staging - -## disinto-ai -- type: asset -- capability: static site, landing page, dashboard -- domain: disinto.ai, www.disinto.ai -- note: served by Caddy on harb-staging - -## telegram-clawy -- type: communication -- capability: notify human, collect decisions, relay vault requests -- note: OpenClaw bot, human's primary interface diff --git a/bin/disinto b/bin/disinto index 873b055..558efb1 100755 --- a/bin/disinto +++ b/bin/disinto @@ -699,6 +699,127 @@ setup_forge() { echo "Forge: ${forge_url} (ready)" } +# Create and seed the {project}-ops repo on Forgejo with initial directory structure. +# The ops repo holds operational data: vault items, journals, evidence, prerequisites. 
+setup_ops_repo() { + local forge_url="$1" ops_slug="$2" ops_root="$3" primary_branch="${4:-main}" + local org_name="${ops_slug%%/*}" + local ops_name="${ops_slug##*/}" + + echo "" + echo "── Ops repo setup ─────────────────────────────────────" + + # Check if ops repo already exists on Forgejo + if curl -sf --max-time 5 \ + -H "Authorization: token ${FORGE_TOKEN}" \ + "${forge_url}/api/v1/repos/${ops_slug}" >/dev/null 2>&1; then + echo "Ops repo: ${ops_slug} (already exists on Forgejo)" + else + # Create ops repo under org + if ! curl -sf -X POST \ + -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \ + -H "Content-Type: application/json" \ + "${forge_url}/api/v1/orgs/${org_name}/repos" \ + -d "{\"name\":\"${ops_name}\",\"auto_init\":true,\"default_branch\":\"${primary_branch}\",\"description\":\"Operational data for ${org_name}/${ops_name%-ops}\"}" >/dev/null 2>&1; then + # Fallback: create under the user + curl -sf -X POST \ + -H "Authorization: token ${FORGE_TOKEN}" \ + -H "Content-Type: application/json" \ + "${forge_url}/api/v1/user/repos" \ + -d "{\"name\":\"${ops_name}\",\"auto_init\":true,\"default_branch\":\"${primary_branch}\",\"description\":\"Operational data\"}" >/dev/null 2>&1 || true + fi + + # Add all bot users as collaborators + local bot_user + for bot_user in dev-bot review-bot planner-bot gardener-bot vault-bot supervisor-bot predictor-bot action-bot; do + curl -sf -X PUT \ + -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \ + -H "Content-Type: application/json" \ + "${forge_url}/api/v1/repos/${ops_slug}/collaborators/${bot_user}" \ + -d '{"permission":"write"}' >/dev/null 2>&1 || true + done + + echo "Ops repo: ${ops_slug} created on Forgejo" + fi + + # Clone ops repo locally if not present + if [ ! 
-d "${ops_root}/.git" ]; then + local auth_url + auth_url=$(printf '%s' "$forge_url" | sed "s|://|://dev-bot:${FORGE_TOKEN}@|") + local clone_url="${auth_url}/${ops_slug}.git" + echo "Cloning: ops repo -> ${ops_root}" + git clone --quiet "$clone_url" "$ops_root" 2>/dev/null || { + echo "Initializing: ops repo at ${ops_root}" + mkdir -p "$ops_root" + git -C "$ops_root" init --initial-branch="${primary_branch}" -q + } + else + echo "Ops repo: ${ops_root} (already exists locally)" + fi + + # Seed directory structure + local seeded=false + mkdir -p "${ops_root}/vault/pending" + mkdir -p "${ops_root}/vault/approved" + mkdir -p "${ops_root}/vault/fired" + mkdir -p "${ops_root}/vault/rejected" + mkdir -p "${ops_root}/journal/planner" + mkdir -p "${ops_root}/journal/supervisor" + mkdir -p "${ops_root}/knowledge" + mkdir -p "${ops_root}/evidence/engagement" + + if [ ! -f "${ops_root}/README.md" ]; then + cat > "${ops_root}/README.md" < "${ops_root}/portfolio.md"; seeded=true; } + [ -f "${ops_root}/prerequisites.md" ] || { echo "# Prerequisite Tree" > "${ops_root}/prerequisites.md"; seeded=true; } + [ -f "${ops_root}/RESOURCES.md" ] || { echo "# Resources" > "${ops_root}/RESOURCES.md"; seeded=true; } + + # Commit and push seed content + if [ "$seeded" = true ] && [ -d "${ops_root}/.git" ]; then + git -C "$ops_root" add -A + if ! git -C "$ops_root" diff --cached --quiet 2>/dev/null; then + git -C "$ops_root" commit -m "chore: seed ops repo structure" -q + # Push if remote exists + if git -C "$ops_root" remote get-url origin >/dev/null 2>&1; then + git -C "$ops_root" push origin "${primary_branch}" -q 2>/dev/null || true + fi + fi + echo "Seeded: ops repo with initial structure" + fi +} + # Push local clone to the Forgejo remote. 
push_to_forge() { local repo_root="$1" forge_url="$2" repo_slug="$3" @@ -874,8 +995,10 @@ generate_toml() { name = "${name}" repo = "${repo}" +ops_repo = "${repo}-ops" forge_url = "${forge_url}" repo_root = "${root}" +ops_repo_root = "/home/${USER}/${name}-ops" primary_branch = "${branch}" [ci] @@ -1290,6 +1413,11 @@ p.write_text(text) fi echo "Branch: ${branch}" + # Set up {project}-ops repo (#757) + local ops_slug="${forge_repo}-ops" + local ops_root="/home/${USER}/${project_name}-ops" + setup_ops_repo "$forge_url" "$ops_slug" "$ops_root" "$branch" + # Generate project TOML (skip if already exists) if [ "$toml_exists" = false ]; then # Prompt for CI ID if interactive and not already set via flag diff --git a/docs/EVIDENCE-ARCHITECTURE.md b/docs/EVIDENCE-ARCHITECTURE.md index f181978..944f47f 100644 --- a/docs/EVIDENCE-ARCHITECTURE.md +++ b/docs/EVIDENCE-ARCHITECTURE.md @@ -35,14 +35,14 @@ Different domains have different platforms: Agents won't need to understand each platform. **Processes act as adapters** — they will read a platform's API and write structured evidence to git. 
``` -[Caddy logs] ──→ collect-engagement process ──→ evidence/engagement/YYYY-MM-DD.json -[Google Analytics] ──→ measure-funnel process ──→ evidence/funnel/YYYY-MM-DD.json -[Ponder GraphQL] ──→ measure-protocol process ──→ evidence/protocol/YYYY-MM-DD.json -[System stats] ──→ measure-resources process ──→ evidence/resources/YYYY-MM-DD.json -[Playwright] ──→ run-user-test process ──→ evidence/user-test/YYYY-MM-DD.json +[Caddy logs] ──→ collect-engagement process ──→ {project}-ops/evidence/engagement/YYYY-MM-DD.json +[Google Analytics] ──→ measure-funnel process ──→ {project}-ops/evidence/funnel/YYYY-MM-DD.json +[Ponder GraphQL] ──→ measure-protocol process ──→ {project}-ops/evidence/protocol/YYYY-MM-DD.json +[System stats] ──→ measure-resources process ──→ {project}-ops/evidence/resources/YYYY-MM-DD.json +[Playwright] ──→ run-user-test process ──→ {project}-ops/evidence/user-test/YYYY-MM-DD.json ``` -The planner will read `evidence/` — not Analytics, not Ponder, not DigitalOcean. Evidence is the normalized interface between the world and decisions. +The planner will read `$OPS_REPO_ROOT/evidence/` — not Analytics, not Ponder, not DigitalOcean. Evidence is the normalized interface between the world and decisions. > **Terminology note — "process" vs "formula":** In this document, "process" means a self-contained measurement or mutation pipeline that reads an external platform and writes structured evidence to git. This is distinct from disinto's "formulas" (`formulas/*.toml`), which are TOML issue templates that guide agents through multi-step operational work (see `AGENTS.md` § Directory layout). Processes produce evidence; formulas orchestrate agent tasks. 
diff --git a/evidence/engagement/.gitkeep b/evidence/engagement/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/formulas/groom-backlog.toml b/formulas/groom-backlog.toml index 7e50328..7915a80 100644 --- a/formulas/groom-backlog.toml +++ b/formulas/groom-backlog.toml @@ -96,7 +96,7 @@ The dev-agent is completely starved until they are promoted or resolved. For each tier-0 issue: - Read the full body: curl -sf -H "Authorization: token $FORGE_TOKEN" "$FORGE_API/issues/{number}" - If resolvable: promote to backlog — add acceptance criteria, affected files, relabel - - If needs human decision: file a vault procurement item (vault/pending/<id>.md) + - If needs human decision: file a vault procurement item ($OPS_REPO_ROOT/vault/pending/<id>.md) - If invalid / wontfix: close with explanation comment After completing all tier-0, re-fetch to check for new blockers: @@ -136,7 +136,7 @@ DUPLICATE (>80% overlap after reading both bodies — confirm before closing): Write: echo "ACTION: closed #NNN as duplicate of #OLDER" >> "$RESULT_FILE" VAULT (ambiguous scope, architectural question, needs human decision): - File a vault procurement item at $PROJECT_REPO_ROOT/vault/pending/<id>.md: + File a vault procurement item at $OPS_REPO_ROOT/vault/pending/<id>.md: # <What decision or resource is needed> ## What <description> @@ -144,7 +144,7 @@ VAULT (ambiguous scope, architectural question, needs human decision): <which issue this unblocks> ## Unblocks - #NNN — <title> - Log: echo "VAULT: filed vault/pending/<id>.md for #NNN — <reason>" >> "$RESULT_FILE" + Log: echo "VAULT: filed $OPS_REPO_ROOT/vault/pending/<id>.md for #NNN — <reason>" >> "$RESULT_FILE" Dust vs ore rules: Dust: comment fix, variable rename, whitespace/formatting, single-line edit, trivial cleanup with no behavior change diff --git a/formulas/review-pr.toml b/formulas/review-pr.toml index 47427bb..b74f1e3 100644 --- a/formulas/review-pr.toml +++ b/formulas/review-pr.toml @@ -63,7 +63,7 @@ Do NOT flag: ## 4. 
Vault item quality (conditional) -If the PR adds or modifies files in `vault/pending/*.md`, apply these +If the PR adds or modifies vault item files (`vault/pending/*.md` in the ops repo), apply these additional checks. These criteria apply ON TOP of the normal review — a vault PR must also pass the standard checklist above. @@ -102,9 +102,9 @@ propose a specific action. ### Dedup check -Check whether `vault/pending/`, `vault/approved/`, or `vault/fired/` +Check whether `$OPS_REPO_ROOT/vault/pending/`, `$OPS_REPO_ROOT/vault/approved/`, or `$OPS_REPO_ROOT/vault/fired/` already contains a similar item (same resource, same ask). List the -vault directories to inspect existing items. If a duplicate or +vault directories in the ops repo to inspect existing items. If a duplicate or near-duplicate exists, REQUEST_CHANGES and reference the existing item. ## 5. External action detection (token separation) @@ -112,7 +112,7 @@ near-duplicate exists, REQUEST_CHANGES and reference the existing item. Agents must NEVER execute external actions directly. Any action that touches an external system (publish, deploy, post, push to external registry, API calls to third-party services) MUST go through vault dispatch — i.e., the -agent files a vault item (`vault/pending/*.json`) and the vault-runner +agent files a vault item (`$OPS_REPO_ROOT/vault/pending/*.json`) and the vault-runner container executes it with injected secrets. Scan the diff for these patterns: @@ -128,7 +128,7 @@ Scan the diff for these patterns: If ANY of these patterns appear in agent code (scripts in `dev/`, `action/`, `planner/`, `gardener/`, `supervisor/`, `predictor/`, `review/`, `formulas/`, -`lib/`) WITHOUT routing through vault dispatch (`vault/pending/`, `vault-fire.sh`, +`lib/`) WITHOUT routing through vault dispatch (`$OPS_REPO_ROOT/vault/pending/`, `vault-fire.sh`, `vault-run-action.sh`), **REQUEST_CHANGES**. Explain that external actions must use vault dispatch per AD-006. 
The agent diff --git a/formulas/run-gardener.toml b/formulas/run-gardener.toml index d39d9db..a262ac2 100644 --- a/formulas/run-gardener.toml +++ b/formulas/run-gardener.toml @@ -120,7 +120,7 @@ DUST (trivial — single-line edit, rename, comment, style, whitespace): of 3+ into one backlog issue. VAULT (needs human decision or external resource): - File a vault procurement item at $PROJECT_REPO_ROOT/vault/pending/<id>.md: + File a vault procurement item at $OPS_REPO_ROOT/vault/pending/<id>.md: # <What decision or resource is needed> ## What <description> @@ -128,7 +128,7 @@ VAULT (needs human decision or external resource): <which issue this unblocks> ## Unblocks - #NNN — <title> - Log: echo "VAULT: filed vault/pending/<id>.md for #NNN — <reason>" >> "$RESULT_FILE" + Log: echo "VAULT: filed $OPS_REPO_ROOT/vault/pending/<id>.md for #NNN — <reason>" >> "$RESULT_FILE" CLEAN (only if truly nothing to do): echo 'CLEAN' >> "$RESULT_FILE" diff --git a/formulas/run-planner.toml b/formulas/run-planner.toml index fbebb7b..3848fce 100644 --- a/formulas/run-planner.toml +++ b/formulas/run-planner.toml @@ -21,8 +21,9 @@ version = 4 model = "opus" [context] -files = ["VISION.md", "AGENTS.md", "RESOURCES.md", "planner/prerequisite-tree.md"] -# Recent planner/journal/*.md files + graph report loaded by planner-run.sh +files = ["VISION.md", "AGENTS.md"] +# RESOURCES.md and prerequisites.md loaded from ops repo (ops: prefix) +# Recent journal/planner/*.md files + graph report loaded by planner-run.sh [[steps]] id = "preflight" @@ -40,10 +41,10 @@ description = """ HEAD_SHA=$(git rev-parse HEAD) echo "$HEAD_SHA" > /tmp/planner-head-sha -4. Read the planner memory file at: $PROJECT_REPO_ROOT/planner/MEMORY.md +4. Read the planner memory file at: $OPS_REPO_ROOT/knowledge/planner-memory.md If it does not exist, this is the first planning run. -5. Read the prerequisite tree at: $PROJECT_REPO_ROOT/planner/prerequisite-tree.md +5. 
Read the prerequisite tree at: $OPS_REPO_ROOT/prerequisites.md If it does not exist, create an initial tree from VISION.md in the next step. 6. Read the graph report injected into the prompt (## Structural analysis). @@ -121,7 +122,7 @@ Update the tree: 2. Recalculate objective status (READY/BLOCKED/DONE) 3. Add new prerequisites discovered from graph report 4. Add new objectives from VISION.md not yet in tree - 5. Check vault state: vault/pending/*.md + vault/approved/*.md (blocked-on-vault), vault/fired/*.md (resolved?) + 5. Check vault state: $OPS_REPO_ROOT/vault/pending/*.md + $OPS_REPO_ROOT/vault/approved/*.md (blocked-on-vault), $OPS_REPO_ROOT/vault/fired/*.md (resolved?) 6. Check RESOURCES.md for newly available capabilities Bounce/stuck detection — for issues in the tree, fetch recent comments: @@ -141,7 +142,7 @@ Tree format: ## Objective: <name> (#issue or description) - [x] Resolved prerequisite (reference) - [ ] Unresolved prerequisite (#issue or description) - - [ ] Resource need blocked-on-vault (vault/pending/<id>.md) + - [ ] Resource need blocked-on-vault ($OPS_REPO_ROOT/vault/pending/<id>.md) Status: READY | BLOCKED — <reason> | DONE ### Part C: File at constraints @@ -157,7 +158,7 @@ Stuck issue handling: procurement item instead of skipping. First check for duplicates across ALL vault directories (pending/, approved/, fired/) — if a file with the same slug already exists in any of them, do NOT create a new one. - Naming: vault/pending/<project>-<slug>.md (e.g. disinto-github-org.md). + Naming: $OPS_REPO_ROOT/vault/pending/<project>-<slug>.md (e.g. disinto-github-org.md). Write with this template: # Request: <short description> @@ -181,7 +182,7 @@ Stuck issue handling: ## Unblocks - #<issue> — <title> - Then mark the prerequisite in the tree as "blocked-on-vault (vault/pending/<id>.md)". + Then mark the prerequisite in the tree as "blocked-on-vault ($OPS_REPO_ROOT/vault/pending/<id>.md)". 
Do NOT skip or mark as "awaiting human decision" — the vault owns the human interface. Filing gate (for non-stuck constraints): @@ -197,9 +198,9 @@ Priority label sync: "$FORGE_API/issues/<num>/labels/<priority_label_id>" Vault procurement: if a constraint needs a resource not in RESOURCES.md with -recurring cost, create vault/pending/<project>-<slug>.md instead of an issue. +recurring cost, create $OPS_REPO_ROOT/vault/pending/<project>-<slug>.md instead of an issue. Use the same template as HUMAN_BLOCKED above (What/Why/Human action/Factory will then/Unblocks). -Dedup: check vault/pending/ + vault/approved/ + vault/fired/ before creating. +Dedup: check $OPS_REPO_ROOT/vault/pending/ + $OPS_REPO_ROOT/vault/approved/ + $OPS_REPO_ROOT/vault/fired/ before creating. Rules: - Action budget: the planner may create at most (predictions_addressed + 1) @@ -220,10 +221,10 @@ id = "journal-and-commit" title = "Write tree, journal, optional memory; commit and PR" description = """ ### 1. Write prerequisite tree -Write to: $PROJECT_REPO_ROOT/planner/prerequisite-tree.md +Write to: $OPS_REPO_ROOT/prerequisites.md ### 2. Write journal entry -Create/append to: $PROJECT_REPO_ROOT/planner/journal/$(date -u +%Y-%m-%d).md +Create/append to: $OPS_REPO_ROOT/journal/planner/$(date -u +%Y-%m-%d).md Format: # Planner run — YYYY-MM-DD HH:MM UTC @@ -242,7 +243,7 @@ Format: (or "No stuck issues detected") ## Vault items filed - - vault/pending/<id>.md — <what> — blocks #NNN + - $OPS_REPO_ROOT/vault/pending/<id>.md — <what> — blocks #NNN (or "No vault items filed") ## Issues created @@ -261,28 +262,21 @@ Keep concise — 30-50 lines max. ### 3. Memory update (every 5th run) Count "# Planner run —" headers across all journal files. -Check "<!-- summarized-through-run: N -->" in MEMORY.md. -If (count - N) >= 5 or MEMORY.md missing, write to: - $PROJECT_REPO_ROOT/planner/MEMORY.md +Check "<!-- summarized-through-run: N -->" in planner-memory.md. 
+If (count - N) >= 5 or planner-memory.md missing, write to: + $OPS_REPO_ROOT/knowledge/planner-memory.md Include: run counter marker, date, constraint focus, patterns, direction. Keep under 100 lines. Replace entire file. -### 4. Commit and PR -If no file changes (git status --porcelain), skip. -Otherwise: - BRANCH="chore/planner-$(date -u +%Y%m%d-%H%M)" - git checkout -B "$BRANCH" - git add planner/prerequisite-tree.md planner/journal/ planner/MEMORY.md vault/pending/ +### 4. Commit ops repo changes +Commit the ops repo changes (prerequisites, journal, memory, vault items): + cd "$OPS_REPO_ROOT" + git add prerequisites.md journal/planner/ knowledge/planner-memory.md vault/pending/ git add -u - git diff --cached --quiet && skip - git commit -m "chore: planner run $(date -u +%Y-%m-%d)" - git push -u origin "$BRANCH" - Create PR via forge API: - curl -sf -X POST -H "Authorization: token $FORGE_TOKEN" \ - -H "Content-Type: application/json" "$FORGE_API/pulls" \ - -d '{"title":"chore: planner run — prerequisite tree update", - "head":"<branch>","base":"<primary-branch>", - "body":"Automated planner run — prerequisite tree update and journal entry."}' - git checkout "$PRIMARY_BRANCH" + if ! 
git diff --cached --quiet; then + git commit -m "chore: planner run $(date -u +%Y-%m-%d)" + git push origin "$PRIMARY_BRANCH" + fi + cd "$PROJECT_REPO_ROOT" """ needs = ["triage-and-plan"] diff --git a/formulas/run-predictor.toml b/formulas/run-predictor.toml index e427382..eb14412 100644 --- a/formulas/run-predictor.toml +++ b/formulas/run-predictor.toml @@ -18,7 +18,8 @@ version = 3 model = "sonnet" [context] -files = ["AGENTS.md", "RESOURCES.md", "VISION.md", "planner/prerequisite-tree.md"] +files = ["AGENTS.md", "VISION.md"] +# RESOURCES.md and prerequisites.md loaded from ops repo (ops: prefix) graph_report = "Structural analysis JSON from lib/build-graph.py — orphans, cycles, thin objectives, bottlenecks" [[steps]] @@ -48,12 +49,12 @@ Set up the working environment and load your prediction history. unreviewed (planner hasn't seen it yet) 3. Read the prerequisite tree: - cat "$PROJECT_REPO_ROOT/planner/prerequisite-tree.md" + cat "$OPS_REPO_ROOT/prerequisites.md" 4. Count evidence per claim area: for dir in evidence/red-team evidence/holdout evidence/evolution evidence/user-test; do - echo "=== $dir ===$(find "$PROJECT_REPO_ROOT/$dir" -name '*.json' 2>/dev/null | wc -l) files" - find "$PROJECT_REPO_ROOT/$dir" -name '*.json' -printf '%T+ %p\n' 2>/dev/null | sort -r | head -3 + echo "=== $dir ===$(find "$OPS_REPO_ROOT/$dir" -name '*.json' 2>/dev/null | wc -l) files" + find "$OPS_REPO_ROOT/$dir" -name '*.json' -printf '%T+ %p\n' 2>/dev/null | sort -r | head -3 done 5. 
Check current system state (lightweight — don't over-collect): diff --git a/formulas/run-publish-site.toml b/formulas/run-publish-site.toml index cd3624f..2de4455 100644 --- a/formulas/run-publish-site.toml +++ b/formulas/run-publish-site.toml @@ -209,7 +209,7 @@ Check 2 — collect-engagement.sh is present in the repo: fi Check 3 — engagement evidence has been collected at least once: - EVIDENCE_DIR="$FACTORY_ROOT/evidence/engagement" + EVIDENCE_DIR="$OPS_REPO_ROOT/evidence/engagement" LATEST=$(ls -1t "$EVIDENCE_DIR"/*.json 2>/dev/null | head -1 || true) if [ -n "$LATEST" ]; then echo "OK: Latest engagement report: $LATEST" @@ -222,7 +222,7 @@ Check 3 — engagement evidence has been collected at least once: Summary: echo "" echo "Observable status: addressable=disinto.ai measurement=caddy-access-logs" - echo "Evidence path: evidence/engagement/YYYY-MM-DD.json" - echo "Consumer: planner reads evidence/engagement/ during gap analysis" + echo "Evidence path: \$OPS_REPO_ROOT/evidence/engagement/YYYY-MM-DD.json" + echo "Consumer: planner reads ops repo evidence/engagement/ during gap analysis" """ needs = ["verify"] diff --git a/formulas/run-supervisor.toml b/formulas/run-supervisor.toml index 6d9d15a..6f60905 100644 --- a/formulas/run-supervisor.toml +++ b/formulas/run-supervisor.toml @@ -34,12 +34,12 @@ and injected into your prompt above. Review them now. (24h grace period). Check the "Stale Phase Cleanup" section for any files cleaned or in grace period this run. -2. Check vault state: read vault/pending/*.md for any procurement items +2. Check vault state: read $OPS_REPO_ROOT/vault/pending/*.md for any procurement items the planner has filed. Note items relevant to the health assessment (e.g. a blocked resource that explains why the pipeline is stalled). 3. 
Read the supervisor journal for recent history: - JOURNAL_FILE="$FACTORY_ROOT/supervisor/journal/$(date -u +%Y-%m-%d).md" + JOURNAL_FILE="$OPS_REPO_ROOT/journal/supervisor/$(date -u +%Y-%m-%d).md" if [ -f "$JOURNAL_FILE" ]; then cat "$JOURNAL_FILE"; fi 4. Note any values that cross these thresholds: @@ -151,7 +151,7 @@ For each finding from the health assessment, decide and execute an action. For P0-P2 issues that persist after auto-fix attempts, or issues requiring human judgment, file a vault procurement item: - Write $PROJECT_REPO_ROOT/vault/pending/supervisor-<issue-slug>.md: + Write $OPS_REPO_ROOT/vault/pending/supervisor-<issue-slug>.md: # <What is needed> ## What <description of the problem and why the supervisor cannot fix it> @@ -162,11 +162,11 @@ human judgment, file a vault procurement item: The vault-poll will notify the human and track the request. Read the relevant best-practices file before taking action: - cat "$FACTORY_ROOT/supervisor/best-practices/memory.md" # P0 - cat "$FACTORY_ROOT/supervisor/best-practices/disk.md" # P1 - cat "$FACTORY_ROOT/supervisor/best-practices/ci.md" # P2 CI - cat "$FACTORY_ROOT/supervisor/best-practices/dev-agent.md" # P2 agent - cat "$FACTORY_ROOT/supervisor/best-practices/git.md" # P2 git + cat "$OPS_REPO_ROOT/knowledge/memory.md" # P0 + cat "$OPS_REPO_ROOT/knowledge/disk.md" # P1 + cat "$OPS_REPO_ROOT/knowledge/ci.md" # P2 CI + cat "$OPS_REPO_ROOT/knowledge/dev-agent.md" # P2 agent + cat "$OPS_REPO_ROOT/knowledge/git.md" # P2 git Track what you fixed and what vault items you filed for the report step. """ @@ -208,7 +208,7 @@ description = """ Append a timestamped entry to the supervisor journal. File path: - $FACTORY_ROOT/supervisor/journal/$(date -u +%Y-%m-%d).md + $OPS_REPO_ROOT/journal/supervisor/$(date -u +%Y-%m-%d).md If the file already exists (multiple runs per day), append a new section. If it does not exist, create it. 
diff --git a/lib/AGENTS.md b/lib/AGENTS.md index 265c65d..09847fa 100644 --- a/lib/AGENTS.md +++ b/lib/AGENTS.md @@ -14,7 +14,7 @@ sourced as needed. | `lib/formula-session.sh` | `acquire_cron_lock()`, `check_memory()`, `load_formula()`, `build_context_block()`, `consume_escalation_reply()`, `start_formula_session()`, `formula_phase_callback()`, `build_prompt_footer()`, `build_graph_section()`, `run_formula_and_monitor(AGENT [TIMEOUT] [CALLBACK])` — shared helpers for formula-driven cron agents (lock, memory guard, formula loading, prompt assembly, tmux session, monitor loop, crash recovery). `build_graph_section()` generates the structural-analysis section (runs `lib/build-graph.py`, formats JSON output) — previously duplicated in planner-run.sh and predictor-run.sh, now shared here. `formula_phase_callback()` handles `PHASE:escalate` (unified escalation path — kills the session). `run_formula_and_monitor` accepts an optional CALLBACK (default: `formula_phase_callback`) so callers can install custom merge-through or escalation handlers. | planner-run.sh, predictor-run.sh, gardener-run.sh, supervisor-run.sh, dev-agent.sh, action-agent.sh | | `lib/guard.sh` | `check_active(agent_name)` — reads `$FACTORY_ROOT/state/.{agent_name}-active`; exits 0 (skip) if the file is absent. Factory is off by default — state files must be created to enable each agent. **Logs a message to stderr** when skipping (`[check_active] SKIP: state file not found`), so agent dropout is visible in cron logs. Sourced by dev-poll.sh, review-poll.sh, action-poll.sh, predictor-run.sh, supervisor-run.sh. | cron entry points | | `lib/mirrors.sh` | `mirror_push()` — pushes `$PRIMARY_BRANCH` + tags to all configured mirror remotes (fire-and-forget background pushes). Reads `MIRROR_NAMES` and `MIRROR_*` vars exported by `load-project.sh` from the `[mirrors]` TOML section. Failures are logged but never block the pipeline. 
Sourced by dev-poll.sh and dev/phase-handler.sh — called after every successful merge. | dev-poll.sh, phase-handler.sh | -| `lib/build-graph.py` | Python tool: parses VISION.md, prerequisite-tree.md, AGENTS.md, formulas/*.toml, evidence/, and forge issues/labels into a NetworkX DiGraph. Runs structural analyses (orphaned objectives, stale prerequisites, thin evidence, circular deps) and outputs a JSON report. Used by `review-pr.sh` (per-PR changed-file analysis) and `predictor-run.sh` (full-project analysis) to provide structural context to Claude. | review-pr.sh, predictor-run.sh | +| `lib/build-graph.py` | Python tool: parses VISION.md, prerequisites.md (from ops repo), AGENTS.md, formulas/*.toml, evidence/ (from ops repo), and forge issues/labels into a NetworkX DiGraph. Runs structural analyses (orphaned objectives, stale prerequisites, thin evidence, circular deps) and outputs a JSON report. Used by `review-pr.sh` (per-PR changed-file analysis) and `predictor-run.sh` (full-project analysis) to provide structural context to Claude. | review-pr.sh, predictor-run.sh | | `lib/secret-scan.sh` | `scan_for_secrets()` — detects potential secrets (API keys, bearer tokens, private keys, URLs with embedded credentials) in text; returns 1 if secrets found. `redact_secrets()` — replaces detected secret patterns with `[REDACTED]`. | file-action-issue.sh, phase-handler.sh | | `lib/file-action-issue.sh` | `file_action_issue()` — dedup check, secret scan, label lookup, and issue creation for formula-driven cron wrappers. Sets `FILED_ISSUE_NUM` on success. Returns 4 if secrets detected in body. | (available for future use) | | `lib/tea-helpers.sh` | `tea_file_issue(title, body, labels...)` — create issue via tea CLI with secret scanning; sets `FILED_ISSUE_NUM`. `tea_relabel(issue_num, labels...)` — replace labels using tea's `edit` subcommand (not `label`). `tea_comment(issue_num, body)` — add comment with secret scanning. `tea_close(issue_num)` — close issue. 
All use `TEA_LOGIN` and `FORGE_REPO` from env.sh. Labels by name (no ID lookup). Tea binary download verified via sha256 checksum. Sourced by env.sh when `tea` binary is available. | env.sh (conditional) | diff --git a/lib/env.sh b/lib/env.sh index ef602dd..ca8d40e 100755 --- a/lib/env.sh +++ b/lib/env.sh @@ -86,6 +86,13 @@ export TEA_LOGIN export PROJECT_NAME="${PROJECT_NAME:-${FORGE_REPO##*/}}" export PROJECT_REPO_ROOT="${PROJECT_REPO_ROOT:-/home/${USER}/${PROJECT_NAME}}" export PRIMARY_BRANCH="${PRIMARY_BRANCH:-master}" + +# Ops repo: operational data (vault items, journals, evidence, prerequisites). +# Default convention: sibling directory named {project}-ops. +export OPS_REPO_ROOT="${OPS_REPO_ROOT:-/home/${USER}/${PROJECT_NAME}-ops}" + +# Forge repo slug for the ops repo (used by agents that commit to ops). +export FORGE_OPS_REPO="${FORGE_OPS_REPO:-${FORGE_REPO:+${FORGE_REPO}-ops}}" export WOODPECKER_REPO_ID="${WOODPECKER_REPO_ID:-}" export WOODPECKER_SERVER="${WOODPECKER_SERVER:-http://localhost:8000}" export CLAUDE_TIMEOUT="${CLAUDE_TIMEOUT:-7200}" diff --git a/lib/formula-session.sh b/lib/formula-session.sh index 0265c5d..93201c0 100644 --- a/lib/formula-session.sh +++ b/lib/formula-session.sh @@ -67,37 +67,91 @@ load_formula() { # build_context_block FILE [FILE ...] # Reads each file from $PROJECT_REPO_ROOT and builds CONTEXT_BLOCK. +# Files prefixed with "ops:" are read from $OPS_REPO_ROOT instead. 
build_context_block() { CONTEXT_BLOCK="" - local ctx ctx_path + local ctx ctx_path ctx_label for ctx in "$@"; do - ctx_path="${PROJECT_REPO_ROOT}/${ctx}" + case "$ctx" in + ops:*) + ctx_label="${ctx#ops:}" + ctx_path="${OPS_REPO_ROOT}/${ctx_label}" + ;; + *) + ctx_label="$ctx" + ctx_path="${PROJECT_REPO_ROOT}/${ctx}" + ;; + esac if [ -f "$ctx_path" ]; then CONTEXT_BLOCK="${CONTEXT_BLOCK} -### ${ctx} +### ${ctx_label} $(cat "$ctx_path") " fi done } -# ── Escalation reply consumption ───────────────────────────────────────── +# ── Ops repo helpers ───────────────────────────────────────────────── -# consume_escalation_reply AGENT_NAME -# Atomically consumes /tmp/{agent}-escalation-reply if it exists. -# Sets ESCALATION_REPLY to the file contents (empty string if no reply). -consume_escalation_reply() { - local agent="$1" - local reply_file="/tmp/${agent}-escalation-reply" - ESCALATION_REPLY="" - if [ -s "$reply_file" ]; then - local tmp_file="${reply_file}.consumed.$$" - if mv "$reply_file" "$tmp_file" 2>/dev/null; then - ESCALATION_REPLY=$(cat "$tmp_file") - rm -f "$tmp_file" - log "Consumed escalation reply: $(echo "$ESCALATION_REPLY" | head -1)" - fi +# ensure_ops_repo +# Clones the ops repo, or fetches and fast-forwards an existing clone (best-effort), so agents can read/write operational data. +# Uses: OPS_REPO_ROOT, FORGE_OPS_REPO, PRIMARY_BRANCH; FORGE_URL (defaults to http://localhost:3000) and FORGE_TOKEN are optional. +# No-op only if OPS_REPO_ROOT is unset; if the clone fails, a plain local directory is created instead. 
+ensure_ops_repo() { + local ops_root="${OPS_REPO_ROOT:-}" + [ -n "$ops_root" ] || return 0 + + if [ -d "${ops_root}/.git" ]; then + # Pull latest from primary branch + git -C "$ops_root" fetch origin "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true + git -C "$ops_root" checkout "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true + git -C "$ops_root" pull --ff-only origin "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true + return 0 fi + + # Clone from Forgejo + local ops_repo="${FORGE_OPS_REPO:-}" + [ -n "$ops_repo" ] || return 0 + local forge_url="${FORGE_URL:-http://localhost:3000}" + local clone_url + if [ -n "${FORGE_TOKEN:-}" ]; then + local auth_url + auth_url=$(printf '%s' "$forge_url" | sed "s|://|://$(whoami):${FORGE_TOKEN}@|") + clone_url="${auth_url}/${ops_repo}.git" + else + clone_url="${forge_url}/${ops_repo}.git" + fi + + log "Cloning ops repo: ${ops_repo} -> ${ops_root}" + if git clone --quiet "$clone_url" "$ops_root" 2>/dev/null; then + log "Ops repo cloned: ${ops_root}" + else + log "WARNING: failed to clone ops repo ${ops_repo} — creating local directory" + mkdir -p "$ops_root" + fi +} + +# ops_commit_and_push MESSAGE [FILE ...] +# Stage, commit, and push changes in the ops repo. +# If no files specified, stages all changes. +ops_commit_and_push() { + local msg="$1" + shift + local ops_root="${OPS_REPO_ROOT:-}" + [ -d "${ops_root}/.git" ] || return 0 + + ( + cd "$ops_root" || return + if [ $# -gt 0 ]; then + git add "$@" + else + git add -A + fi + if ! 
git diff --cached --quiet; then + git commit -m "$msg" + git push origin "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true + fi + ) } # ── Session management ─────────────────────────────────────────────────── @@ -296,6 +350,7 @@ NEVER echo or include the actual token value in output — always reference \${F ## Environment FACTORY_ROOT=${FACTORY_ROOT} PROJECT_REPO_ROOT=${PROJECT_REPO_ROOT} +OPS_REPO_ROOT=${OPS_REPO_ROOT} PRIMARY_BRANCH=${PRIMARY_BRANCH} PHASE_FILE=${PHASE_FILE} diff --git a/lib/load-project.sh b/lib/load-project.sh index 1caa4a9..0ef6301 100755 --- a/lib/load-project.sh +++ b/lib/load-project.sh @@ -43,6 +43,10 @@ emit('FORGE_URL', cfg.get('forge_url', '')) if 'repo_root' in cfg: emit('PROJECT_REPO_ROOT', cfg['repo_root']) +if 'ops_repo_root' in cfg: + emit('OPS_REPO_ROOT', cfg['ops_repo_root']) +if 'ops_repo' in cfg: + emit('FORGE_OPS_REPO', cfg['ops_repo']) if 'primary_branch' in cfg: emit('PRIMARY_BRANCH', cfg['primary_branch']) @@ -99,4 +103,14 @@ if [ -z "${PROJECT_REPO_ROOT:-}" ] && [ -n "${PROJECT_NAME:-}" ]; then export PROJECT_REPO_ROOT="/home/${USER}/${PROJECT_NAME}" fi +# Derive OPS_REPO_ROOT if not explicitly set +if [ -z "${OPS_REPO_ROOT:-}" ] && [ -n "${PROJECT_NAME:-}" ]; then + export OPS_REPO_ROOT="/home/${USER}/${PROJECT_NAME}-ops" +fi + +# Derive FORGE_OPS_REPO if not explicitly set +if [ -z "${FORGE_OPS_REPO:-}" ] && [ -n "${FORGE_REPO:-}" ]; then + export FORGE_OPS_REPO="${FORGE_REPO}-ops" +fi + unset _PROJECT_TOML _PROJECT_VARS _key _val diff --git a/planner/AGENTS.md b/planner/AGENTS.md index 3303f88..9749afd 100644 --- a/planner/AGENTS.md +++ b/planner/AGENTS.md @@ -4,7 +4,7 @@ **Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints), executed directly from cron via tmux + Claude. Phase 0 (preflight): pull latest code, load persistent memory and prerequisite -tree from `planner/MEMORY.md` and `planner/prerequisite-tree.md`. 
Also reads +tree from `$OPS_REPO_ROOT/knowledge/planner-memory.md` and `$OPS_REPO_ROOT/prerequisites.md`. Also reads all available formulas: factory formulas (`$FACTORY_ROOT/formulas/*.toml`) and project-specific formulas (`$PROJECT_REPO_ROOT/formulas/*.toml`). Phase 1 (prediction-triage): triage `prediction/unreviewed` issues filed by the @@ -20,7 +20,7 @@ prerequisites, discover new ones, update the tree. **Also scans comments on referenced issues for bounce/stuck signals** (BOUNCED, LABEL_CHURN) to detect issues ping-ponging between backlog and underspecified. Issues that need human decisions or external resources are filed as vault procurement items -(`vault/pending/*.md`) instead of being escalated. Phase 3 +(`$OPS_REPO_ROOT/vault/pending/*.md`) instead of being escalated. Phase 3 (file-at-constraints): identify the top 3 unresolved prerequisites that block the most downstream objectives — file issues as either `backlog` (code changes, dev-agent) or `action` (run existing formula, action-agent). **Stuck issues @@ -28,18 +28,17 @@ dev-agent) or `action` (run existing formula, action-agent). **Stuck issues in breakdown mode instead of being re-promoted** — this breaks the ping-pong loop by splitting them into dev-agent-sized sub-issues. **Human-blocked issues are routed through the vault** — the planner files an actionable procurement -item (`vault/pending/<project>-<slug>.md` with What/Why/Human action/Factory +item (`$OPS_REPO_ROOT/vault/pending/<project>-<slug>.md` with What/Why/Human action/Factory will then sections) and marks the prerequisite as blocked-on-vault in the tree. Deduplication: checks pending/ + approved/ + fired/ before creating. Phase 4 (journal-and-memory): write updated prerequisite tree + daily journal -entry (committed to git) and update `planner/MEMORY.md` (committed to git). -Phase 5 (commit-and-pr): one commit with all file changes, push, create PR. +entry (committed to ops repo) and update `$OPS_REPO_ROOT/knowledge/planner-memory.md`. 
+Phase 5 (commit-ops): commit all ops repo changes, push directly. AGENTS.md maintenance is handled by the Gardener. -**Artifacts use `$PROJECT_REPO_ROOT`**: All planner artifacts (journal, -prerequisite tree, memory, vault state) live under `$PROJECT_REPO_ROOT/planner/` -and `$PROJECT_REPO_ROOT/vault/`, not `$FACTORY_ROOT`. Each project manages its -own planner state independently. +**Artifacts use `$OPS_REPO_ROOT`**: All planner artifacts (journal, +prerequisite tree, memory, vault state) live under `$OPS_REPO_ROOT/`. +Each project manages its own planner state in a separate ops repo. **Trigger**: `planner-run.sh` runs daily via cron (accepts an optional project TOML argument, defaults to `projects/disinto.toml`). Sources `lib/guard.sh` and @@ -60,12 +59,12 @@ component, not work. - `formulas/groom-backlog.toml` — Dual-mode formula: grooming (default) or breakdown (dispatched by planner for bounced/stuck issues — splits the issue into dev-agent-sized sub-issues, removes `underspecified` label) -- `planner/prerequisite-tree.md` — Prerequisite tree: versioned constraint +- `$OPS_REPO_ROOT/prerequisites.md` — Prerequisite tree: versioned constraint map linking VISION.md objectives to their prerequisites. Planner owns the tree, humans steer by editing VISION.md. Tree grows organically as the planner discovers new prerequisites during runs -- `planner/MEMORY.md` — Persistent memory across runs (committed to git) -- `planner/journal/*.md` — Daily raw logs from each planner run (committed to git) +- `$OPS_REPO_ROOT/knowledge/planner-memory.md` — Persistent memory across runs (in ops repo) +- `$OPS_REPO_ROOT/journal/planner/*.md` — Daily raw logs from each planner run (in ops repo) **Constraint focus**: The planner uses Theory of Constraints to avoid premature issue filing. Only the top 3 unresolved prerequisites that block the most @@ -74,5 +73,5 @@ prerequisite tree but NOT as issues. 
This prevents the "spray issues across all milestones" pattern that produced premature work in planner v1/v2. **Environment variables consumed**: -- `FORGE_TOKEN`, `FORGE_PLANNER_TOKEN` (falls back to FORGE_TOKEN), `FORGE_REPO`, `FORGE_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT` +- `FORGE_TOKEN`, `FORGE_PLANNER_TOKEN` (falls back to FORGE_TOKEN), `FORGE_REPO`, `FORGE_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT`, `OPS_REPO_ROOT` - `PRIMARY_BRANCH`, `CLAUDE_MODEL` (set to opus by planner-run.sh) diff --git a/planner/MEMORY.md b/planner/MEMORY.md deleted file mode 100644 index 706dc25..0000000 --- a/planner/MEMORY.md +++ /dev/null @@ -1,45 +0,0 @@ -<!-- summarized-through-run: 6 --> -# Planner Memory - -## 2026-03-26 — Sixth planner run - -### Milestone state -- **Foundation**: COMPLETE. All agent loops, supervisor, planner, multi-project, knowledge graph, predictor-planner feedback loop — all working. -- **Adoption**: 4/5 COMPLETE. Bootstrap (#393), docs (#394), dashboard (#395), landing page (#534) all done. Only #466 (example project) remains — stuck on human decision since 2026-03-23. -- **Ship (Fold 2)**: ENTERING SCOPE. Rent-a-human (#679) done. Exec agent (#699) done. Observable addressables (#718) filed. Deploy profiles and assumptions register not yet tracked. -- **Scale**: DEFERRED. No external users yet. Plugin system, community formulas, hosted option all premature. 
- -### Completed since last summary (runs 2-6) -- Bootstrap fully hardened: init smoke test (#668), CI wiring (#661), Forgejo reachability (#660), 10+ bootstrap fixes -- Full stack containerized (#618, #619) with Forgejo, Woodpecker, Dendrite -- Autonomous merge pipeline (#568) — PRs auto-merge on CI pass + approval -- Unified escalation path (#510) — PHASE:escalate replaces needs_human -- Factory operational reliability — guard logging (#663), stale phase cleanup (#664) -- Prediction/backlog killed (#686) — planner now only ACTIONs or DISMISSes predictions -- Planner v2 — graph-driven formula (#667), tea CLI integration (#666) -- Exec agent (#699) — interactive assistant via Matrix -- Rent-a-human (#679) — formula-dispatchable human action drafts -- Tech-debt queue cleared (~30 items) -- Skill package initiative started (#710-#715) from research (#709) - -### Patterns -- **Label loss resolved**: #535 fixed the recurring label-loss pattern. Labels now persist reliably. -- **Predictor signal quality improved**: Later runs show 100% substantive predictions. Over-signaling on transient ops issues has stopped. -- **Human bottleneck is real**: #466 escalated 2026-03-23, still no response after 3 days. When the factory needs human input and doesn't get it, work halts on that branch entirely. -- **Factory throughput is extreme when unblocked**: 50+ issues cleared in ~5 days (2026-03-20 to 2026-03-25). Pipeline processes ~10 issues/day when backlog is stocked. -- **Duplicate issues from parallel creation**: #710/#714 and #711/#715 are duplicates — likely created in separate exec/research sessions. Gardener should catch these. -- **prediction/backlog migration**: All 4 legacy prediction/backlog items dismissed and closed in run 6. prediction/dismissed label created. - -### Strategic direction -- Ship milestone is the next frontier. Adoption is blocked only on #466 (human decision). 
-- Skill package distribution (#710→#711→#712) is the immediate pipeline work — packaging disinto for external discovery. -- Observable addressables (#718) bridges Fold 2 → Fold 3 — core vision item. -- The factory has the exec agent (#699) and rent-a-human (#679) — two vision capabilities now live. -- VISION.md updated with factory primitives (resources, addressables, observables) — formalizes the framework. - -### Watch list -- #466: human response overdue (3 days) — will it ever be unblocked? -- #710-#712: skill package pipeline — first new work direction since Adoption -- #714/#715: duplicate cleanup by gardener -- prediction/backlog label: should be deleted per #686, still exists -- Ship milestone gaps: deploy profiles, assumptions register, vault-gated folds — not yet filed diff --git a/planner/journal/.gitkeep b/planner/journal/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/planner/journal/2026-03-21.md b/planner/journal/2026-03-21.md deleted file mode 100644 index 414d566..0000000 --- a/planner/journal/2026-03-21.md +++ /dev/null @@ -1,65 +0,0 @@ -# Planner run — 2026-03-21 09:29 UTC - -## Predictions triaged -- #455: DISMISS — orphaned gardener session, transient, supervisor's job -- #454: DISMISS — crashed review session, transient, supervisor recovers -- #449: DISMISS — legacy predictor duplication, already tracked by #419 -- #448: WATCH — disk at 75% (was 79% last run), improving trend, supervisor monitors -- #447: DISMISS — swap at 52%, expected behavior with memory guards -- #446: WATCH — harb pipeline stalled 8h on needs_human, supervisor didn't escalate - -## Issues created -- #465: feat: supervisor escalates prolonged PHASE:needs_human states — revealed by #446 pattern, prevents silent pipeline stalls -- #466: feat: example project demonstrating the full Disinto lifecycle — VISION.md Adoption gap, needed by docs and landing page - -## Label fixes -- #393 (disinto init): added backlog label — was created last run but lost its 
label -- #394 (quickstart docs): added backlog label — same issue -- #395 (metrics dashboard): added backlog label — same issue - -## Observations -- Predictor continues to over-signal on transient operational issues (4/6 predictions were transient tmux/session issues the supervisor already handles). Pattern from last run confirmed. -- Adoption issues from last planner run (#393/#394/#395) existed but had NO labels. The dev-agent only picks up backlog-labeled issues. Root cause unclear — either the label API call failed silently during creation, or labels were removed. Fixed this run. -- Foundation milestone remains complete. Adoption still the bottleneck — no progress since last run because issues weren't in the backlog. -- Tech-debt and small backlog items (~20) will be processed before Adoption features due to sequential pipeline and lower issue numbers. -- #357 (action-agent runtime isolation) is in-progress — active dev work happening. - -## Deferred -- Scale milestone (multi-project works, plugin system premature without users) -- Evidence pipeline (harb-specific, blocked on #1047) -- Production halt/resume (#1) — far future, no users to protect yet -- Multi-VPS (#4) — premature, single server handles current load -- Adding backlog labels to #462 (PHASE:escalate) and #291 (secrets in formulas) — both valid but not highest leverage this cycle - ---- - -# Planner run — 2026-03-21 10:05 UTC - -## Predictions triaged -No unreviewed predictions. - -## Issues created -No new issues — backlog is well-stocked (~30 open items) and aligned with VISION.md. 
- -## Label fixes -- #291 (secrets in formulas): added backlog label — deferred last run, now promoted -- #289 (gardener creates investigation issues for closed escalations): added backlog label -- #462 (PHASE:escalate): added backlog label — complements #465 - -## Closures -- #144 (reschedule planner after predictor): closed — already implemented (predictor 06:00, planner weekly) - -## Observations -- #357 (action-agent runtime isolation) COMPLETED since last run — merged via PR #464. Watch item resolved. -- #360 (no relabeling on DISMISS) also completed — merged via PR #468. -- Label persistence: #393/#394/#395 retain their backlog labels. Watch item resolved — the label loss from last run was a one-time issue. -- #361 (planner journal pattern) is in-progress — active dev work. -- Backlog queue still deep: ~20 tech-debt and small fixes sit ahead of the 4 Adoption features (#393/#394/#395/#466). Sequential pipeline means Adoption work is weeks out unless manually prioritized. -- Three previously unlabeled issues (#291, #289, #462) were invisible to the dev-agent. Now labeled as backlog. Last run deferred #291 and #462; this run is the right time to make them visible since the pipeline needs work to process. - -## Deferred -- Scale milestone (plugin system, hosted option, community formulas — premature without users) -- Multi-VPS (#4) — single server handles current load -- Production halt/resume (#1) — no users to protect yet -- Self-tuning wake parameters (#2) — current static cron schedule works fine -- Memory update — only 2 runs since last summarization, threshold is 5 diff --git a/planner/journal/2026-03-22.md b/planner/journal/2026-03-22.md deleted file mode 100644 index a943ade..0000000 --- a/planner/journal/2026-03-22.md +++ /dev/null @@ -1,38 +0,0 @@ -# Planner run — 2026-03-22 07:02 UTC - -## Predictions triaged -- #528: DISMISS — missing backlog label on #466. Fixed directly by adding the label. 
Recurring label-loss pattern (4/4 planner-created issues affected). Closed. -- #529: WATCH — swap at 57%, up from 52% (#447). Available RAM (4385MB) still above 2000MB threshold. Monitoring upward trend. - -## Prerequisite tree updates -- Resolved: #393, #394, #395, #510, #504, #516, #514 all closed since last run -- Objectives completed: 5 moved to DONE (init, docs, dashboard, escalation, vault) -- Objectives ready: #466 (example project), #534 (landing page) — both in backlog - -## Top 3 constraints -1. #466 (example project) — blocks Adoption completion → delays Scale — issue already in backlog -2. #534 (landing page value proposition) — blocks Growth goals visibility — issue filed this run -3. #535 (label-loss on planner-created issues) — cross-cutting reliability — issue filed this run - -## Issues created -- #534: feat: landing page communicates value proposition clearly — Adoption milestone gap, Growth goals -- #535: fix: planner-created issues lose backlog labels on creation — 4/4 failure rate, delays pipeline activation - -## Label fixes -- #466: added backlog label — confirmed prediction #528 was correct, recurring pattern - -## Observations -- Explosive progress: ~50 issues closed between 2026-03-20 and 2026-03-22. Foundation and most of Adoption now complete. -- Adoption milestone nearly done: only #466 (example project) and #534 (landing page) remain. Both are READY with no blocking prerequisites. -- Tech-debt queue (9 items: #93, #110, #179, #310, #311, #330, #429, #433, #435) has lower issue numbers than #466, so sequential pipeline will process them first. -- #531 (dev-poll direct merges) is in-progress — improves pipeline throughput for approved PRs. -- Vault infrastructure deployed but empty (no pending/approved/fired items). Procurement gate available when Scale needs resources. -- Label-loss pattern confirmed as persistent: 4/4 planner-created issues lost backlog labels. Filed #535 to investigate and fix. 
-- Predictor signal quality improving: 2 predictions this run (vs 6 last run), both actionable. -- Scale milestone remains premature — no users yet. Plugin system, community formulas, hosted option all deferred. - -## Deferred (in tree, not filed) -- Scale: plugin system, community formulas, hosted option — premature without users -- Vision items: #1 (halt/resume), #2 (self-tuning), #4 (multi-VPS) — far future -- Prediction #448 (disk 75%): still in prediction/backlog, trend was improving -- Prediction #446 (harb stall): supervisor escalation (#465/#510) now in place, should prevent recurrence diff --git a/planner/journal/2026-03-23.md b/planner/journal/2026-03-23.md deleted file mode 100644 index 27f7cc2..0000000 --- a/planner/journal/2026-03-23.md +++ /dev/null @@ -1,44 +0,0 @@ -# Planner run — 2026-03-23 07:15 UTC - -## Predictions triaged -- #583: DISMISS — #568 (merge guard blocker) already exists as an open issue. Fixed labels directly (added backlog+priority). -- #582: DISMISS — backlog depletion is expected; this planner run is the replenishment cycle. Factory cleared 20+ issues in 48h. -- #581: DISMISS — Gitea CVEs are Codeberg's upstream infrastructure. Disinto can't upgrade their Gitea. RCE requires repo-template processing (not in our workflow). Auto-merge cancellation mitigated by our review-agent flow. -- #580: WATCH — Caddy CVEs. disinto.ai is a static site without FastCGI/PHP, so the HIGH RCE (CVE-2026-27590) doesn't apply. Medium CVEs low risk. No system-upgrade formula available. - -## Prerequisite tree updates -- Resolved: #534 (landing page) → DONE, #535 (label-loss fix) → resolved -- Discovered: #568 (merge guard blocker) added as new objective — every PR merge escalates, blocking full pipeline autonomy -- Status change: #466 remains READY, #534 moved to DONE - -## Top 3 constraints -1. #568 — PreToolUse guard blocks merge — affects every PR across all agents — issue already open, added backlog+priority -2. 
#466 — example project (last Adoption item) — blocks Adoption completion — issue already open, added backlog+priority -3. Tech-debt backlog visibility — 9 items invisible to dev-poll — fixed by adding backlog labels to all 9 - -## Issues created -No new issues — all constraints already have existing issues. - -## Priority label changes -- Added priority: #568, #466 (top 2 constraints) -- No priority labels removed (clean set) - -## Label fixes -- #568: added backlog + priority (was unlabeled, invisible to dev-poll) -- #466: added backlog + priority (label-loss recurrence — 5th time this pattern appears) -- #93, #110, #179, #310, #311, #330, #429, #433, #435: added backlog label to all 9 tech-debt items - -## Observations -- Explosive throughput confirmed: factory cleared entire backlog (20+ issues, 19+ PRs) in ~48h. The predictor correctly flagged the empty state (#582). -- Label-loss persists despite #535 fix: #466 lost its label AGAIN. The #535 fix addressed planner-created label application, but #466 was created before that fix. Root cause may be that the original label was never applied, or was stripped by gardener quality gate (the issue body does have acceptance criteria, so the gate shouldn't strip it). -- Merge guard (#568) is the #1 factory constraint: every PR requires human merge intervention. Dev-poll's try_direct_merge() catches approved PRs eventually, but with delay and false escalations. This should be fixed before the factory tackles #466 (which will generate multiple PRs). -- Adoption milestone nearly complete: 4/5 objectives DONE (#393, #394, #395, #534). Only #466 remains. -- Scale milestone remains premature — no external users yet. Plugin system, community formulas, hosted option all deferred. -- Vault infrastructure deployed but empty — no procurement requests needed this cycle. -- RESOURCES.md unchanged since last run. 
- -## Deferred (in tree, not filed) -- Scale: plugin system, community formulas, hosted option — premature without users -- Vision items: #1 (halt/resume), #2 (self-tuning), #4 (multi-VPS) — far future -- Prediction #529 (swap 57%): still in prediction/backlog, stable -- Prediction #580 (Caddy CVEs): watching, static site mitigates RCE diff --git a/planner/journal/2026-03-25.md b/planner/journal/2026-03-25.md deleted file mode 100644 index 0df3e09..0000000 --- a/planner/journal/2026-03-25.md +++ /dev/null @@ -1,53 +0,0 @@ -# Planner run — 2026-03-25 07:15 UTC - -## Predictions triaged -- #656: DISMISS — planning deadlock is resolved by this run; tree staleness corrected -- #655: PROMOTE_BACKLOG → #663 — check_active guard should log when skipping -- #644: WATCH — disk P1 reactive cleanup works; not urgent enough for backlog slot -- #643: PROMOTE_BACKLOG → #664 — supervisor should clean stale phase files for closed issues -- #642: DISMISS — HTTP 401 likely caused by #653 (wrong remote), now fixed -- #640: DISMISS — all 5 bootstrap failures (#634-638) closed; remaining fixes in pipeline - -## Prerequisite tree updates -- Resolved: #568 (merge guard) moved from BLOCKED to DONE — was closed but tree was stale -- Resolved: bootstrap hardening issues #634-638, #652, #653, #658 all closed -- Discovered: #660 (Forgejo reachability) and #661 (Woodpecker CI wiring) as remaining init prerequisites -- Added: new objective "Factory operational reliability" with #663 and #664 -- Added: #668 (end-to-end init smoke test) as init prerequisite -- Status change: #466 marked ESCALATED (bounced + gardener escalation, awaiting human decision) - -## Top 5 constraints -1. #466 — example project — blocks Adoption completion — ESCALATED, awaiting human decision -2. #661 — Woodpecker CI wiring — blocks init completeness — in backlog with priority -3. #668 — init smoke test — blocks init quality assurance — filed this run -4. 
#663 — guard logging — prevents invisible agent dropouts — filed this run (from #655) -5. #664 — stale phase cleanup — reduces supervisor signal noise — filed this run (from #643) - -## Stuck issues detected -- #466: BOUNCED (1x, "too large for single session") + ESCALATED (gardener: "needs human decision on approach") — added comment noting escalation seen, suggested option (b) local demo may be viable - -## Issues created -- #663: fix: check_active guard should log to stderr when skipping (from prediction #655) -- #664: fix: supervisor should clean up stale PHASE:escalate files (from prediction #643) -- #668: feat: end-to-end disinto init smoke test in CI (new constraint — init quality) - -## Priority label changes -- Added priority: #661, #663, #664, #668 (top constraints) -- Kept priority: #466 (still #1 constraint, though escalated) -- No priority removed (only #466 had it previously) - -## Observations -- Massive progress since last run (2026-03-23): ~30 issues closed in 48h. Bootstrap hardening wave (#634-638, #652, #653, #658) completed. Full stack containerized (#618, #619). -- Planner missed 2026-03-24 run due to active-state guard deploy gap (#655). State files created 21h after guard merged. No visible signal of the missed run. -- Factory nearly idle: 1 in-progress (#660), 1 backlog (#661), plus 3 newly filed. After current pipeline clears, #466 is the only Adoption item left. -- Adoption milestone 4/5 complete. #466 is stuck on human decision (external vs local demo). The containerized stack work makes option (b) viable — suggested in comment. -- #568 (merge guard) was marked BLOCKED in tree but actually closed — 2-day stale tree from missed planner run. -- Predictor signal quality: 6 predictions, all substantive. 2 promoted, 1 watched, 3 dismissed. Better signal-to-noise than earlier runs. -- RESOURCES.md unchanged. Vault empty (no procurement requests). 
- -## Deferred (in tree, not filed) -- Scale: plugin system, community formulas, hosted option — premature without users -- Vision items: #1 (halt/resume), #2 (self-tuning), #4 (multi-VPS) — far future -- Prediction #644 (disk P1): watching, reactive cleanup works -- Prediction #580 (Caddy CVEs): still in prediction/backlog, static site mitigates -- Prediction #529 (swap 57%): stable, not trending worse diff --git a/planner/journal/2026-03-26.md b/planner/journal/2026-03-26.md deleted file mode 100644 index 36f0682..0000000 --- a/planner/journal/2026-03-26.md +++ /dev/null @@ -1,54 +0,0 @@ -# Planner run — 2026-03-26 07:15 UTC - -## Predictions triaged -- #644: DISMISS — disk P1 handled by supervisor reactive cleanup, no persistent issue -- #580: DISMISS — Caddy CVEs don't apply to static site (no FastCGI/PHP) -- #529: DISMISS — swap stable at 57%, supervisor monitors, not trending worse -- #446: DISMISS — root cause fixed by #465/#510 escalation path improvements -All 4 were prediction/backlog → migrated to prediction/dismissed per #686 policy. Created prediction/dismissed label (was missing). - -## Prerequisite tree updates -- Resolved: #660 (Forgejo reachability), #661 (Woodpecker CI wiring), #668 (init smoke test), #663 (guard logging), #664 (stale phase cleanup) — all closed -- Objectives completed: bootstrap (#393) now FULLY DONE (all prereqs resolved), factory operational reliability DONE -- New objectives added: exec agent (#699) DONE, rent-a-human (#679) DONE, skill package distribution (#710-#712), observable addressables (#718) -- Discovered: #710-#715 skill package initiative (created since last run), with #714/#715 as duplicates of #710/#711 - -## Top 5 constraints -1. #466 — example project — blocks Adoption completion — ESCALATED 3 days, no human response -2. #710 — skill package creation — enables distribution — in backlog with priority -3. #718 — observable addressables — Ship milestone bridge — filed this run -4. 
#714/#715 — duplicate issues — pending gardener cleanup -5. prediction/backlog label — should be deleted per #686 — needs admin action - -## Stuck issues detected -- #466: ESCALATED (since 2026-03-23, 3 days) — no human response. Dev-agent bounced as too large, gardener escalated for approach decision. Not re-promoting — already has priority label. - -## Issues created -- #718: feat: observable addressables — engagement measurement for deployed artifacts — Ship milestone, Fold 2→3 bridge - -## Priority label changes -- Added priority: #710 (next ready pipeline work) -- Kept priority: #466 (still #1 constraint, escalated) -- No priority removed - -## Label changes -- #710, #711, #712, #713: added backlog (were unlabeled, invisible to pipeline) -- #644, #580, #529, #446: relabeled from prediction/backlog to prediction/dismissed, closed -- Created prediction/dismissed label (id: 1335444) — missing despite #686 implementation - -## Observations -- Explosive progress continues: ~30 issues closed since last run (2026-03-25). Bootstrap fully hardened, init smoke test passing, exec agent deployed, rent-a-human implemented. -- Factory nearly idle: no backlog items existed until this run labeled #710-#713. The skill package chain is the only ready work. -- Skill package initiative (#710-#715) appeared since last run — created from #709 research. Two parallel tracks with duplicates (#710/#714, #711/#715). Preferred #710 chain, flagged duplicates for gardener. -- Adoption milestone 4/5 complete. #466 stuck 3 days on human decision. All technical prerequisites resolved. The human bottleneck is real. -- Ship milestone entering scope: #679 (rent-a-human) and #699 (exec agent) already done. #718 (observable addressables) filed. Deploy profiles and assumptions register remain untracked. -- VISION.md updated this cycle (3c97ddb) — factory primitives (resources, addressables, observables) now formally defined. Tree reflects this. -- RESOURCES.md unchanged. 
Vault empty (no procurement requests). -- Graph report clean: no cycles, no bottlenecks. Orphan issues are all closed bug fixes — expected. - -## Deferred (in tree, not filed) -- Ship: deploy profiles per artifact type — premature until skill package or example project demonstrates need -- Ship: assumptions register — needs design decision on format -- Ship: vault-gated fold transitions — vault infrastructure exists, fold logic not yet designed -- Scale: plugin system, community formulas, hosted option — premature without users -- Vision items: #1 (halt/resume), #2 (self-tuning), #4 (multi-VPS) — far future diff --git a/planner/planner-run.sh b/planner/planner-run.sh index ab7d987..8da3b8b 100755 --- a/planner/planner-run.sh +++ b/planner/planner-run.sh @@ -48,30 +48,33 @@ log "--- Planner run start ---" # ── Load formula + context ─────────────────────────────────────────────── load_formula "$FACTORY_ROOT/formulas/run-planner.toml" -build_context_block VISION.md AGENTS.md RESOURCES.md planner/prerequisite-tree.md +build_context_block VISION.md AGENTS.md ops:RESOURCES.md ops:prerequisites.md # ── Build structural analysis graph ────────────────────────────────────── build_graph_section +# ── Ensure ops repo is available ─────────────────────────────────────── +ensure_ops_repo + # ── Read planner memory ───────────────────────────────────────────────── MEMORY_BLOCK="" -MEMORY_FILE="$PROJECT_REPO_ROOT/planner/MEMORY.md" +MEMORY_FILE="$OPS_REPO_ROOT/knowledge/planner-memory.md" if [ -f "$MEMORY_FILE" ]; then MEMORY_BLOCK=" -### planner/MEMORY.md (persistent memory from prior runs) +### knowledge/planner-memory.md (persistent memory from prior runs) $(cat "$MEMORY_FILE") " fi # ── Read recent journal files ────────────────────────────────────────── JOURNAL_BLOCK="" -JOURNAL_DIR="$PROJECT_REPO_ROOT/planner/journal" +JOURNAL_DIR="$OPS_REPO_ROOT/journal/planner" if [ -d "$JOURNAL_DIR" ]; then # Load last 5 journal files (most recent first) for run history context 
JOURNAL_FILES=$(find "$JOURNAL_DIR" -name '*.md' -type f | sort -r | head -5) if [ -n "$JOURNAL_FILES" ]; then JOURNAL_BLOCK=" -### Recent journal entries (planner/journal/) +### Recent journal entries (journal/planner/) " while IFS= read -r jf; do JOURNAL_BLOCK="${JOURNAL_BLOCK} diff --git a/planner/prerequisite-tree.md b/planner/prerequisite-tree.md deleted file mode 100644 index 694c6e8..0000000 --- a/planner/prerequisite-tree.md +++ /dev/null @@ -1,77 +0,0 @@ -# Prerequisite Tree -<!-- Last updated: 2026-03-26 --> - -## Objective: One-command bootstrap — `disinto init` (#393) -- [x] Core agent loop stable (Foundation) -- [x] Multi-project support (Foundation) -- [x] Guard allows formula agents in worktrees (#487) -- [x] Bundled dust cleanup — set-euo-pipefail (#516) -- [x] Agent-session.sh pre-register worktree trust (#514) -- [x] Bootstrap hardening — Forgejo INSTALL_LOCK (#634), su-exec (#635), admin user (#636), DNS (#637), crontab (#638), auth (#652), remote target (#653), token creation (#658) -- [x] Agents container reaches Forgejo — env.sh override (#660) -- [x] Woodpecker CI wiring during init (#661) -- [x] End-to-end init smoke test (#668) -Status: DONE — all prerequisites resolved, init fully functional - -## Objective: Documentation site with quickstart (#394) -- [x] disinto init working (#393) -Status: DONE — #394 closed - -## Objective: Metrics dashboard (#395) -- [x] disinto init working (#393) -- [x] Supervisor formula stable -Status: DONE — #395 closed - -## Objective: Example project demonstrating full lifecycle (#466) -- [x] disinto init working (#393) -- [ ] Human decision on implementation approach (external repo vs local demo) — blocked-on-vault -Status: BLOCKED — bounced by dev-agent (too large), routed to vault for human decision - -## Objective: Landing page communicating value proposition (#534) -- [x] disinto init working (#393) -- [x] Documentation site live (#394) -- [x] Planner-created issues retain labels reliably (#535) -Status: 
DONE — #534 closed - -## Objective: Autonomous PR merge pipeline (#568) -- [x] PreToolUse guard allows merge API calls from phase-handler (#568) -Status: DONE — #568 closed - -## Objective: Unified escalation path (#510) -- [x] PHASE:escalate replaces PHASE:needs_human (supersedes #465) -Status: DONE — #510 closed - -## Objective: Vault as procurement gate + RESOURCES.md inventory (#504) -- [x] RESOURCES.md exists -- [x] Vault poll scripts deployed (vault-poll.sh) -Status: DONE — #504 closed - -## Objective: Factory operational reliability -- [x] check_active guard logs when skipping (#663) -- [x] Supervisor cleans stale PHASE:escalate files (#664) -Status: DONE — both fixes merged - -## Objective: Exec agent — interactive executive assistant (#699) -- [x] Matrix bot infrastructure -- [x] CHARACTER.md personality definition -- [x] exec-session.sh implementation -Status: DONE — #699 closed - -## Objective: Rent-a-human — formula-dispatchable human action drafts (#679) -- [x] Formula infrastructure (run-rent-a-human.toml) -- [x] Vault gating for human actions -Status: DONE — #679 closed - -## Objective: Skill package distribution (#710 → #711 → #712) -- [ ] Create disinto skill package — SKILL.md + helper scripts (#710) — in backlog, priority -- [ ] Publish to ClawHub registry (#711) — in backlog, depends on #710 -- [ ] Submit to secondary registries (#712) — in backlog, depends on #711 -- [ ] Evaluate MCP server wrapper (#713) — in backlog, independent -- Note: #714, #715 flagged as duplicates of #710, #711 — pending gardener cleanup -Status: READY — no blocking prerequisites - -## Objective: Observable addressables — engagement measurement (#718) -- [ ] Lightweight analytics on disinto.ai (#718) — in backlog -- [ ] Deploy formula verifies measurement is live -- [ ] Planner consumes engagement data -Status: READY — Ship milestone, Fold 2 → Fold 3 bridge diff --git a/predictor/AGENTS.md b/predictor/AGENTS.md index a7bb540..327a842 100644 --- a/predictor/AGENTS.md +++ 
b/predictor/AGENTS.md @@ -15,7 +15,7 @@ The predictor's own prediction history (open + closed issues) serves as its memory — it reviews what was actioned, dismissed, or deferred to decide where to focus next. No hardcoded signal categories; Claude decides where to look based on available data: prerequisite tree, evidence directories, VISION.md, -RESOURCES.md, open issues, agent logs, and external signals (via web search). +RESOURCES.md (from ops repo), open issues, agent logs, and external signals (via web search). Files up to 5 actions per run (predictions + dispatches combined). Each exploit counts as 2 (prediction + action dispatch). The predictor MUST NOT @@ -41,11 +41,11 @@ RAM < 2000 MB). interactive session **Environment variables consumed**: -- `FORGE_TOKEN`, `FORGE_PREDICTOR_TOKEN` (falls back to FORGE_TOKEN), `FORGE_REPO`, `FORGE_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT` +- `FORGE_TOKEN`, `FORGE_PREDICTOR_TOKEN` (falls back to FORGE_TOKEN), `FORGE_REPO`, `FORGE_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT`, `OPS_REPO_ROOT` - `PRIMARY_BRANCH`, `CLAUDE_MODEL` (set to sonnet by predictor-run.sh) **Lifecycle**: predictor-run.sh (daily 06:00 cron) → lock + memory guard → -load formula + context (AGENTS.md, RESOURCES.md, VISION.md, prerequisite-tree.md) +load formula + context (AGENTS.md, VISION.md from code repo; RESOURCES.md, prerequisites.md from ops repo) → create tmux session → Claude fetches prediction history (open + closed) → reviews track record (actioned/dismissed/watching) → finds weaknesses (prerequisite tree gaps, thin evidence, stale watches, external risks) → diff --git a/predictor/predictor-run.sh b/predictor/predictor-run.sh index 504e377..fd6f859 100755 --- a/predictor/predictor-run.sh +++ b/predictor/predictor-run.sh @@ -50,7 +50,7 @@ log "--- Predictor run start ---" # ── Load formula + context ─────────────────────────────────────────────── load_formula "$FACTORY_ROOT/formulas/run-predictor.toml" -build_context_block AGENTS.md RESOURCES.md 
VISION.md planner/prerequisite-tree.md +build_context_block AGENTS.md ops:RESOURCES.md VISION.md ops:prerequisites.md # ── Build structural analysis graph ────────────────────────────────────── build_graph_section diff --git a/projects/disinto.toml.example b/projects/disinto.toml.example index 3cdebc2..ea0b8c5 100644 --- a/projects/disinto.toml.example +++ b/projects/disinto.toml.example @@ -5,8 +5,10 @@ name = "disinto" repo = "johba/disinto" +ops_repo = "johba/disinto-ops" forge_url = "http://localhost:3000" repo_root = "/home/YOU/dark-factory" +ops_repo_root = "/home/YOU/disinto-ops" primary_branch = "main" [ci] diff --git a/site/collect-engagement.sh b/site/collect-engagement.sh index 38e3d5f..6430197 100644 --- a/site/collect-engagement.sh +++ b/site/collect-engagement.sh @@ -32,8 +32,8 @@ log() { # Caddy structured access log (JSON lines) CADDY_LOG="${CADDY_ACCESS_LOG:-/var/log/caddy/access.log}" -# Evidence output directory (committed to git) -EVIDENCE_DIR="${FACTORY_ROOT}/evidence/engagement" +# Evidence output directory (committed to ops repo) +EVIDENCE_DIR="${OPS_REPO_ROOT}/evidence/engagement" # Report date — defaults to today REPORT_DATE=$(date -u +%Y-%m-%d) diff --git a/skill/SKILL.md b/skill/SKILL.md index 17412b8..4077ae0 100644 --- a/skill/SKILL.md +++ b/skill/SKILL.md @@ -324,7 +324,7 @@ Read `VISION.md` at the repo root for the full vision. Then cross-reference with the prerequisite tree: ```bash -cat "${PROJECT_REPO_ROOT}/planner/prerequisite-tree.md" +cat "${OPS_REPO_ROOT}/prerequisites.md" ``` The prerequisite tree maps vision objectives to concrete issues. 
Items marked diff --git a/skill/scripts/read-journal.sh b/skill/scripts/read-journal.sh index 4e4619e..78bd787 100755 --- a/skill/scripts/read-journal.sh +++ b/skill/scripts/read-journal.sh @@ -41,7 +41,7 @@ while [[ $# -gt 0 ]]; do esac done -: "${PROJECT_REPO_ROOT:?PROJECT_REPO_ROOT is required}" +: "${OPS_REPO_ROOT:?OPS_REPO_ROOT is required}" if [[ -z "$agent" ]]; then echo "Error: agent name is required (planner, supervisor, predictor)" >&2 @@ -51,8 +51,8 @@ fi # --- Resolve journal directory --- case "$agent" in - planner) journal_dir="${PROJECT_REPO_ROOT}/planner/journal" ;; - supervisor) journal_dir="${PROJECT_REPO_ROOT}/supervisor/journal" ;; + planner) journal_dir="${OPS_REPO_ROOT}/journal/planner" ;; + supervisor) journal_dir="${OPS_REPO_ROOT}/journal/supervisor" ;; predictor) echo "The predictor does not write journal files." echo "Its memory lives in forge issues labeled 'prediction/unreviewed' and 'prediction/actioned'." diff --git a/supervisor/AGENTS.md b/supervisor/AGENTS.md index 74d8187..322ab4b 100644 --- a/supervisor/AGENTS.md +++ b/supervisor/AGENTS.md @@ -31,10 +31,9 @@ runs directly from cron like the planner and predictor. - `formulas/run-supervisor.toml` — Execution spec: five steps (preflight review, health-assessment, decide-actions, report, journal) with `needs` dependencies. 
Claude evaluates all metrics and takes actions in a single interactive session -- `supervisor/journal/*.md` — Daily health logs from each supervisor run (local, - committed periodically) +- `$OPS_REPO_ROOT/journal/supervisor/*.md` — Daily health logs from each supervisor run - `supervisor/PROMPT.md` — Best-practices reference for remediation actions -- `supervisor/best-practices/*.md` — Domain-specific remediation guides (memory, +- `$OPS_REPO_ROOT/knowledge/*.md` — Domain-specific remediation guides (memory, disk, CI, git, dev-agent, review-agent, forge) - `supervisor/supervisor-poll.sh` — Legacy bash orchestrator (superseded by supervisor-run.sh + formula) @@ -43,7 +42,7 @@ runs directly from cron like the planner and predictor. P3 (degraded PRs, circular deps, stale deps), P4 (housekeeping). **Environment variables consumed**: -- `FORGE_TOKEN`, `FORGE_SUPERVISOR_TOKEN` (falls back to FORGE_TOKEN), `FORGE_REPO`, `FORGE_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT` +- `FORGE_TOKEN`, `FORGE_SUPERVISOR_TOKEN` (falls back to FORGE_TOKEN), `FORGE_REPO`, `FORGE_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT`, `OPS_REPO_ROOT` - `PRIMARY_BRANCH`, `CLAUDE_MODEL` (set to sonnet by supervisor-run.sh) - `WOODPECKER_TOKEN`, `WOODPECKER_SERVER`, `WOODPECKER_DB_PASSWORD`, `WOODPECKER_DB_USER`, `WOODPECKER_DB_HOST`, `WOODPECKER_DB_NAME` — CI database queries diff --git a/supervisor/PROMPT.md b/supervisor/PROMPT.md index a7d1725..7381785 100644 --- a/supervisor/PROMPT.md +++ b/supervisor/PROMPT.md @@ -15,14 +15,14 @@ You are the supervisor agent for `$FORGE_REPO`. You were called because Fix the issue yourself. You have full shell access and `--dangerously-skip-permissions`. 
-Before acting, read the relevant best-practices file: -- Memory issues → `cat ${FACTORY_ROOT}/supervisor/best-practices/memory.md` -- Disk issues → `cat ${FACTORY_ROOT}/supervisor/best-practices/disk.md` -- CI issues → `cat ${FACTORY_ROOT}/supervisor/best-practices/ci.md` -- forge / rate limits → `cat ${FACTORY_ROOT}/supervisor/best-practices/forge.md` -- Dev-agent issues → `cat ${FACTORY_ROOT}/supervisor/best-practices/dev-agent.md` -- Review-agent issues → `cat ${FACTORY_ROOT}/supervisor/best-practices/review-agent.md` -- Git issues → `cat ${FACTORY_ROOT}/supervisor/best-practices/git.md` +Before acting, read the relevant knowledge file from the ops repo: +- Memory issues → `cat ${OPS_REPO_ROOT}/knowledge/memory.md` +- Disk issues → `cat ${OPS_REPO_ROOT}/knowledge/disk.md` +- CI issues → `cat ${OPS_REPO_ROOT}/knowledge/ci.md` +- forge / rate limits → `cat ${OPS_REPO_ROOT}/knowledge/forge.md` +- Dev-agent issues → `cat ${OPS_REPO_ROOT}/knowledge/dev-agent.md` +- Review-agent issues → `cat ${OPS_REPO_ROOT}/knowledge/review-agent.md` +- Git issues → `cat ${OPS_REPO_ROOT}/knowledge/git.md` ## Credentials & API Access @@ -83,7 +83,7 @@ When you see "Dev-agent blocked: last N polls all report 'no ready issues'": File a vault procurement item so the human is notified through the vault: ```bash -cat > "${PROJECT_REPO_ROOT}/vault/pending/supervisor-$(date -u +%Y%m%d-%H%M)-issue.md" <<'VAULT_EOF' +cat > "${OPS_REPO_ROOT}/vault/pending/supervisor-$(date -u +%Y%m%d-%H%M)-issue.md" <<'VAULT_EOF' # <What is needed> ## What <description of the problem and why the supervisor cannot fix it> @@ -106,13 +106,13 @@ FIXED: <what you did> ``` or ``` -VAULT: filed vault/pending/<id>.md — <what's needed> +VAULT: filed $OPS_REPO_ROOT/vault/pending/<id>.md — <what's needed> ``` ## Learning -If you discover something new, append it to the relevant best-practices file: +If you discover something new, append it to the relevant knowledge file in the ops repo: ```bash -bash 
${FACTORY_ROOT}/supervisor/update-prompt.sh "best-practices/<file>.md" "### Lesson title -Description of what you learned." +echo "### Lesson title +Description of what you learned." >> "${OPS_REPO_ROOT}/knowledge/<file>.md" ``` diff --git a/supervisor/best-practices/ci.md b/supervisor/best-practices/ci.md deleted file mode 100644 index ec46b3a..0000000 --- a/supervisor/best-practices/ci.md +++ /dev/null @@ -1,45 +0,0 @@ -# CI Best Practices - -## Environment -- Woodpecker CI at localhost:8000 (Docker backend) -- Postgres DB: use `wpdb` helper from env.sh -- Woodpecker API: use `woodpecker_api` helper from env.sh -- Example (harb): CI images pre-built at `registry.niovi.voyage/harb/*:latest` - -## Safe Fixes -- Retrigger CI (preferred, automated): Woodpecker API POST - ```bash - woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${PIPELINE_NUMBER}" -X POST - ``` - supervisor-poll.sh does this automatically for infra failures (max 2 retries). -- Retrigger CI (manual fallback): push empty commit to PR branch - ```bash - cd /tmp/${PROJECT_NAME}-worktree-<issue> && git commit --allow-empty -m "ci: retrigger" --no-verify && git push origin <branch> --force - ``` -- Restart woodpecker-agent: `sudo systemctl restart woodpecker-agent` -- View pipeline status: `wpdb -c "SELECT number, status FROM pipelines WHERE repo_id=$WOODPECKER_REPO_ID ORDER BY number DESC LIMIT 5;"` -- View failed steps: `bash ${FACTORY_ROOT}/lib/ci-debug.sh failures <pipeline-number>` -- View step logs: `bash ${FACTORY_ROOT}/lib/ci-debug.sh logs <pipeline-number> <step-name>` - -## Dangerous (escalate) -- Restarting woodpecker-server (drops all running pipelines) -- Modifying pipeline configs in `.woodpecker/` directory - -## Known Issues -- forge rate-limits SSH clones. `git` step fails with exit 128. Retrigger usually works. -- `log_entries` table grows fast (was 5.6GB once). Truncate periodically. -- Example (harb): Running CI + harb stack = 14+ containers on 8GB. Memory pressure is real. 
-- CI images take hours to rebuild. Never run `docker system prune -a`. - -## Lessons Learned -- Exit code 128 on git step = forge rate limit, not a code problem. Retrigger. -- Exit code 137 = OOM kill. Check memory, kill stale processes, retrigger. -- `node-quality` step fails on eslint/typescript errors — these need code fixes, not CI fixes. - -### Example (harb): FEE_DEST address must match DeployLocal.sol -When DeployLocal.sol changes the feeDest address, bootstrap-common.sh must also be updated. -Current feeDest = keccak256('harb.local.feeDest') = 0x8A9145E1Ea4C4d7FB08cF1011c8ac1F0e10F9383. -Symptom: bootstrap step exits 1 after 'Granting recenter access to deployer' with no error — setRecenterAccess reverts because wrong address is impersonated. - -### Example (harb): keccak-derived FEE_DEST requires anvil_setBalance before impersonation -When FEE_DEST is a keccak-derived address (e.g. keccak256('harb.local.feeDest')), it has zero ETH balance. Any function that calls `anvil_impersonateAccount` then `cast send --from $FEE_DEST --unlocked` will fail silently (output redirected to LOG_FILE) but exit 1 due to gas deduction failure. Fix: add `cast rpc anvil_setBalance "$FEE_DEST" "0xDE0B6B3A7640000"` before impersonation. Applied in both bootstrap-common.sh and red-team.sh. 
diff --git a/supervisor/best-practices/dev-agent.md b/supervisor/best-practices/dev-agent.md deleted file mode 100644 index 8850df5..0000000 --- a/supervisor/best-practices/dev-agent.md +++ /dev/null @@ -1,93 +0,0 @@ -# Dev-Agent Best Practices - -## Architecture -- `dev-poll.sh` (cron */10) → finds ready backlog issues → spawns `dev-agent.sh` -- `dev-agent.sh` uses `claude -p` for implementation, runs in git worktree -- Lock file: `/tmp/dev-agent.lock` (contains PID) -- Status file: `/tmp/dev-agent-status` -- Worktrees: `/tmp/${PROJECT_NAME}-worktree-<issue-number>/` - -## Safe Fixes -- Remove stale lock: `rm -f /tmp/dev-agent.lock` (only if PID is dead) -- Kill stuck agent: `kill <pid>` then clean lock -- Restart on derailed PR: `bash ${FACTORY_ROOT}/dev/dev-agent.sh <issue-number> &` -- Clean worktree: `cd $PROJECT_REPO_ROOT && git worktree remove /tmp/${PROJECT_NAME}-worktree-<N> --force` -- Remove `in-progress` label if agent died without cleanup: - ```bash - forge_api DELETE "/issues/<N>/labels/in-progress" - ``` - -## Dangerous (escalate) -- Restarting agent on an issue that has an open PR with review changes — may lose context -- Anything that modifies the PR branch history -- Closing PRs or issues - -## Known Issues -- `claude -p -c` (continue) fails if session was compacted — falls back to fresh `-p` -- CI_FIX_COUNT is now reset on CI pass (fixed 2026-03-12), so each review phase gets fresh CI fix budget -- Worktree creation fails if main repo has stale rebase — auto-heals now -- Large text in jq `--arg` can break — write to file first -- `$([ "$VAR" = true ] && echo "...")` crashes under `set -euo pipefail` - -## Lessons Learned -- Agents don't have memory between tasks — full context must be in the prompt -- Prior art injection (closed PR diffs) prevents rework -- Feature issues MUST list affected e2e test files -- CI fix loop is essential — first attempt rarely works -- CLAUDE_TIMEOUT=7200 (2h) is needed for complex issues - -## Dependency Resolution - 
-**Trust closed state.** If a dependency issue is closed, the code is on the primary branch. Period. - -DO NOT try to find the specific PR that closed an issue. This is over-engineering that causes false negatives: -- forge shares issue/PR numbering — no guaranteed relationship -- PRs don't always mention the issue number in title/body -- Searching last N closed PRs misses older merges -- The dev-agent closes issues after merging, so closed = merged - -The only check needed: `issue.state == "closed"`. - -### False Positive: Status Unchanged Alert -The supervisor-poll alert 'status unchanged for Nmin' is a false positive for complex implementation tasks. The status is set to 'claude assessing + implementing' at the START of the `timeout 7200 claude -p ...` call and only updates after Claude finishes. Normal complex tasks (multi-file Solidity changes + forge test) take 45-90 minutes. To distinguish a false positive from a real stuck agent: check that the claude PID is alive (`ps -p <PID>`), consuming CPU (>0%), and has active threads (`pstree -p <PID>`). If the process is alive and using CPU, do NOT restart it — this wastes completed work. - -### False Positive: 'Waiting for CI + Review' Alert -The 'status unchanged for Nmin' alert is also a false positive when status is 'waiting for CI + review on PR #N (round R)'. This is an intentional sleep/poll loop — the agent is waiting for CI to pass and then for review-poll to post a review. CI can take 20–40 minutes; review follows. Do NOT restart the agent. Confirm by checking: (1) agent PID is alive, (2) CI commit status via `forge_api GET /commits/<sha>/status`, (3) review-poll log shows it will pick up the PR on next cycle. - -### False Positive: Shared Status File Causes Giant Age (29M+ min) -When the status file `/tmp/dev-agent-status` doesn't exist, `stat -c %Y` fails and the supervisor falls back to epoch 0. The computed age is then `NOW_EPOCH/60 ≈ 29,567,290 min`, which is unmistakably a false positive. 
-Root cause: the status file is not per-project (tracked as disinto issue #423). It can be missing if: (1) the agent has not written to it yet, (2) cleanup ran early, or (3) another project's cleanup deleted it. -Fix: confirm the agent PID is alive and the tmux session shows active work, then touch the file: `printf '[%s] dev-agent #NNN: <phase> (<project>)\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" > /tmp/dev-agent-status`. This clears the alert without restarting anything. - -### PR CI vs Push CI mismatch causes silent stall in awaiting_review -When push CI passes but PR CI fails (e.g., a duplicate-detection step only runs on pull_request events), the phase-handler transitions to PHASE:awaiting_review without detecting the PR CI failure. The agent then sleeps in the review-poll loop indefinitely. - -Symptom: PR CI=failure but dev-agent phase=awaiting_review, status shows 'waiting for CI + review'. - -Fix: inject the CI failure info into the Claude session with agent_inject_into_session, pointing to the duplicate blocks and telling Claude to fix + push + write PHASE:awaiting_ci. The phase-handler's awaiting_review loop checks for phase file mtime changes every 5 min and will re-enter the main loop automatically. - -### Push CI vs PR CI mismatch — agent picks wrong pipeline number -When the phase-handler injects 'CI failed' with a push pipeline number (e.g. #622), the agent checks that push pipeline, finds it passed, and concludes 'CI OK' — setting PHASE:awaiting_review despite the PR pipeline (#623) being the one that actually failed. -Root cause: the injected event does not always carry the correct pipeline number. -Symptom: agent in awaiting_review with PR CI=failure and push CI=success. -Fix: inject with explicit pipeline #623 (the pull_request event pipeline), point to the failing step and the specific duplicate blocks to fix. 
Use: woodpecker_api /repos/4/pipelines?event=pull_request (or look for event=pull_request in recent pipelines list) to find the correct pipeline number before injecting. - -### Race Condition: Review Posted Before PHASE:awaiting_review Transitions -**Symptom:** Dev-agent status unchanged at 'waiting for review on PR #N', no `review-injected-disinto-N` sentinel, but a formal review already exists on forge and `/tmp/disinto-review-output-N.json` was written before the phase file updated. - -**Root cause:** review-pr.sh runs while the dev-agent is still in PHASE:awaiting_ci. inject_review_into_dev_session returns early (phase check fails). On subsequent review-poll cycles, the PR is skipped (formal review already exists for SHA), so inject is never called again. - -**Fix:** Manually inject the review: -```bash -source /home/debian/dark-factory/lib/env.sh -PROJECT_TOML=/home/debian/dark-factory/projects/disinto.toml -source /home/debian/dark-factory/lib/load-project.sh "$PROJECT_TOML" -PHASE_FILE="/tmp/dev-session-${PROJECT_NAME}-<ISSUE>.phase" -PR_NUM=<N>; PR_BRANCH="fix/issue-<ISSUE>"; PR_SHA=$(cat /tmp/dev-session-${PROJECT_NAME}-<ISSUE>.phase | grep SHA | cut -d: -f2 || git -C $PROJECT_REPO_ROOT rev-parse origin/$PR_BRANCH) -REVIEW_TEXT=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" "${FORGE_API}/issues/${PR_NUM}/comments?limit=50" | jq -r --arg sha "$PR_SHA" '[.[] | select(.body | contains("<!-- reviewed: " + $sha))] | last // empty | .body') -INJECT_MSG="Review: REQUEST_CHANGES on PR #${PR_NUM}:\n\n${REVIEW_TEXT}\n\nInstructions:\n1. Address each piece of feedback carefully.\n2. Run lint and tests when done.\n3. Commit your changes and push: git push origin ${PR_BRANCH}\n4. Write: echo PHASE:awaiting_ci > "${PHASE_FILE}"\n5. Stop and wait for the next CI result." 
-INJECT_TMP=$(mktemp); printf '%s' "$INJECT_MSG" > "$INJECT_TMP" -tmux load-buffer -b inject "$INJECT_TMP" && tmux paste-buffer -t "dev-${PROJECT_NAME}-<ISSUE>" -b inject && sleep 0.5 && tmux send-keys -t "dev-${PROJECT_NAME}-<ISSUE>" '' Enter -touch "/tmp/review-injected-${PROJECT_NAME}-${PR_NUM}" -``` -Then update /tmp/dev-agent-status to reflect current work. diff --git a/supervisor/best-practices/disk.md b/supervisor/best-practices/disk.md deleted file mode 100644 index 2291cf2..0000000 --- a/supervisor/best-practices/disk.md +++ /dev/null @@ -1,24 +0,0 @@ -# Disk Best Practices - -## Safe Fixes -- Docker cleanup: `sudo docker system prune -f` (keeps images, removes stopped containers + dangling layers) -- Truncate supervisor logs >5MB: `truncate -s 0 <file>` -- Remove stale worktrees: check `/tmp/${PROJECT_NAME}-worktree-*`, only if dev-agent not running on them -- Woodpecker log_entries: `DELETE FROM log_entries WHERE id < (SELECT max(id) - 100000 FROM log_entries);` then `VACUUM;` -- Node module caches in worktrees: `rm -rf /tmp/${PROJECT_NAME}-worktree-*/node_modules/` -- Git garbage collection: `cd $PROJECT_REPO_ROOT && git gc --prune=now` - -## Dangerous (escalate) -- `docker system prune -a --volumes` — deletes ALL images including CI build cache -- Deleting anything in `$PROJECT_REPO_ROOT/` that's tracked by git -- Truncating Woodpecker DB tables other than log_entries - -## Known Disk Hogs -- Woodpecker `log_entries` table: grows to 5GB+. Truncate periodically. -- Docker overlay layers: survive normal prune. `-a` variant kills everything. 
-- Git worktrees in /tmp: accumulate node_modules, build artifacts -- Forge cache in `~/.foundry/cache/`: can grow large with many compilations - -## Lessons Learned -- After truncating log_entries, run VACUUM FULL (reclaims actual disk space) -- Docker ghost overlay layers need `prune -a` but that kills CI images — only do this if truly desperate diff --git a/supervisor/best-practices/forge.md b/supervisor/best-practices/forge.md deleted file mode 100644 index e0ea130..0000000 --- a/supervisor/best-practices/forge.md +++ /dev/null @@ -1,36 +0,0 @@ -# Forge Best Practices - -## Rate Limiting -The forge (Forgejo/Gitea) may rate-limit SSH and HTTPS clones. Symptoms: -- Woodpecker `git` step fails with exit code 128 -- Multiple pipelines fail in quick succession with the same error -- Retriggers make it WORSE by adding more clone attempts - -### What To Do -- **Do NOT retrigger** during a rate-limit storm. Wait 10-15 minutes. -- Check if multiple pipelines failed on `git` step recently: - ```bash - wpdb -c "SELECT number, status, to_timestamp(started) FROM pipelines WHERE repo_id=$WOODPECKER_REPO_ID AND status='failure' ORDER BY number DESC LIMIT 5;" - wpdb -c "SELECT s.name, s.exit_code FROM steps s JOIN pipelines p ON s.pipeline_id=p.id WHERE p.number=<N> AND p.repo_id=$WOODPECKER_REPO_ID AND s.state='failure';" - ``` -- If multiple `git` failures with exit 128 in the last 15 min → it's rate limiting. Wait. -- Only retrigger after 15+ minutes of no CI activity. - -### How To Retrigger Safely -```bash -cd <worktree> && git commit --allow-empty -m "ci: retrigger" --no-verify && git push origin <branch> --force -``` - -### Prevention -- The system runs 3 agents staggered by 3 minutes. During heavy development, many PRs trigger CI simultaneously. -- One pipeline at a time is ideal on this VPS (resource + rate limit reasons). -- If >3 pipelines are pending/running, do NOT create more work. - -## API Tokens -- API token is in `.env` as `FORGE_TOKEN` — loaded via env.sh. 
-- Review bot has a separate token (`$FORGE_REVIEW_TOKEN`) for formal reviews. -- With local Forgejo, tokens don't expire. For remote forges, check provider docs. - -## Lessons Learned -- Retrigger storm on 2026-03-12: supervisor + dev-agent both retriggered during rate limit, caused 5+ failed pipelines. Added cooldown awareness. -- Empty commit retrigger works but adds noise to git history. Acceptable tradeoff. diff --git a/supervisor/best-practices/git.md b/supervisor/best-practices/git.md deleted file mode 100644 index 6551d3a..0000000 --- a/supervisor/best-practices/git.md +++ /dev/null @@ -1,61 +0,0 @@ -# Git Best Practices - -## Environment -- Repo: `$PROJECT_REPO_ROOT`, remote: `$PROJECT_REMOTE` -- Branch: `$PRIMARY_BRANCH` (protected — no direct push, PRs only) -- Worktrees: `/tmp/${PROJECT_NAME}-worktree-<issue>/` - -## Safe Fixes -- Abort stale rebase: `cd $PROJECT_REPO_ROOT && git rebase --abort` -- Switch to $PRIMARY_BRANCH: `git checkout $PRIMARY_BRANCH` -- Prune worktrees: `git worktree prune` -- Reset dirty state: `git checkout -- .` (only uncommitted changes) -- Fetch latest: `git fetch origin $PRIMARY_BRANCH` - -## Auto-fixable by Supervisor -- **Merge conflict on approved PR**: rebase onto $PRIMARY_BRANCH and force-push - ```bash - cd /tmp/${PROJECT_NAME}-worktree-<issue> || git worktree add /tmp/${PROJECT_NAME}-worktree-<issue> <branch> - cd /tmp/${PROJECT_NAME}-worktree-<issue> - git fetch origin $PRIMARY_BRANCH - git rebase origin/$PRIMARY_BRANCH - # If conflict is trivial (NatSpec, comments): resolve and continue - # If conflict is code logic: escalate to Clawy - git push origin <branch> --force - ``` -- **Stale rebase**: `git rebase --abort && git checkout $PRIMARY_BRANCH` -- **Wrong branch**: `git checkout $PRIMARY_BRANCH` - -## Dangerous (escalate) -- `git reset --hard` on any branch with unpushed work -- Deleting remote branches -- Force-pushing to any branch -- Anything on the $PRIMARY_BRANCH branch directly - -## Known Issues -- Main 
repo MUST be on $PRIMARY_BRANCH at all times. Dev work happens in worktrees. -- Stale rebases (detached HEAD) break all worktree creation — silent pipeline stall. -- `git worktree add` fails if target directory exists (even empty). Remove first. -- Many old branches exist locally (100+). Normal — don't bulk-delete. - -## Evolution Pipeline -- The evolution pipeline (`tools/push3-evolution/evolve.sh`) temporarily modifies - `onchain/src/OptimizerV3.sol` and `onchain/src/OptimizerV3Push3.sol` during runs. -- **DO NOT revert these files while evolution is running** (check: `pgrep -f evolve.sh`). -- If `/tmp/evolution.pid` exists and the PID is alive, the dirty state is intentional. -- Evolution will restore the files when it finishes. - -## Lessons Learned -- NEVER delete remote branches before confirming merge. Close PR, rebase locally, force-push if needed. -- Stale rebase caused 5h pipeline stall once (2026-03-11). Auto-heal added to dev-agent. -- lint-staged hooks fail when `forge` not in PATH. Use `--no-verify` when committing from scripts. - -### PR #608 Post-Mortem (2026-03-12/13) -PR sat blocked for 24 hours while 21 other PRs merged. Root causes: -1. **Supervisor didn't detect merge conflicts** — only checked CI state, not `mergeable`. Fixed: now checks `mergeable=false` as first condition. -2. **Supervisor didn't detect stale REQUEST_CHANGES** — review bot requested changes, dev-agent never came back to fix them, moved on to other issues. Need: detect "PR has REQUEST_CHANGES older than N hours with no new push." -3. **No staleness kill switch** — after N merge conflicts or N days, a PR should be auto-closed and the issue reopened for a fresh attempt. Rebasing across 21 commits is more work than starting over. - -**Rules derived:** -- Supervisor should close PRs that are >24h old with merge conflicts and no recent activity. Reopen the parent issue with a note pointing to the closed PR as prior art. 
-- Dev-agent must not abandon a PR with REQUEST_CHANGES — either fix or close it before moving to new work. diff --git a/supervisor/best-practices/memory.md b/supervisor/best-practices/memory.md deleted file mode 100644 index fb6c3ce..0000000 --- a/supervisor/best-practices/memory.md +++ /dev/null @@ -1,29 +0,0 @@ -# Memory Best Practices - -## Environment -- VPS: 8GB RAM, 4GB swap, Debian -- Running: Docker stack (8 containers), Woodpecker CI, OpenClaw gateway - -## Safe Fixes (no permission needed) -- Kill stale `claude` processes (>3h old): `pgrep -f "claude" --older 10800 | xargs kill` -- Drop filesystem caches: `sync && echo 3 | sudo tee /proc/sys/vm/drop_caches` -- Restart bloated Anvil: `sudo docker restart ${PROJECT_NAME}-anvil-1` (grows to 12GB+ over hours) -- Kill orphan node processes from dead worktrees - -## Dangerous (escalate) -- `docker system prune -a --volumes` — kills CI images, hours to rebuild -- Stopping project stack containers — breaks dev environment -- OOM that survives all safe fixes — needs human decision on what to kill - -## Known Memory Hogs -- `claude` processes from dev-agent: 200MB+ each, can zombie -- `dockerd`: 600MB+ baseline (normal) -- `openclaw-gateway`: 500MB+ (normal) -- Anvil container: starts small, grows unbounded over hours -- `forge build` with via_ir: can spike to 4GB+. Use `--skip test script` to reduce. -- Vite dev servers inside containers: 150MB+ each - -## Lessons Learned -- After killing processes, always `sync && echo 3 | sudo tee /proc/sys/vm/drop_caches` -- Swap doesn't drain from dropping caches alone — it's actual paged-out process memory -- Running CI + full project stack = 14+ containers on 8GB. Only one pipeline at a time. 
diff --git a/supervisor/best-practices/review-agent.md b/supervisor/best-practices/review-agent.md deleted file mode 100644 index 53865e7..0000000 --- a/supervisor/best-practices/review-agent.md +++ /dev/null @@ -1,30 +0,0 @@ -# Review Agent Best Practices - -## Architecture -- `review-poll.sh` (cron */10) → finds open PRs with CI pass + no review → spawns `review-pr.sh` -- `review-pr.sh` uses `claude -p` to review the diff, posts structured comment -- Uses `review_bot` forge account for formal reviews (separate from main account) -- Skips WIP/draft PRs (`[WIP]` in title or draft flag) - -## Safe Fixes -- Manually trigger review: `bash ${FACTORY_ROOT}/review/review-pr.sh <pr-number>` -- Force re-review: `bash ${FACTORY_ROOT}/review/review-pr.sh <pr-number> --force` -- Check review log: `tail -20 ${FACTORY_ROOT}/review/review.log` - -## Common Failures -- **"SKIP: CI=failure"** — review bot won't review until CI passes. Fix CI first. -- **"already reviewed"** — bot checks `<!-- reviewed: SHA -->` comment marker. Use `--force` to override. -- **Review error comment** — uses `<!-- review-error: SHA -->` marker, does NOT count as reviewed. Bot should retry automatically. -- **Self-narration collapse** — bot sometimes narrates instead of producing structured JSON. JSON output format in the prompt prevents this. -- **Hallucinated findings** — bot may flag non-issues. This needs Clawy's judgment — escalate. 
- -## Monitoring -- Unreviewed PRs with CI pass for >1h → supervisor-poll.sh auto-triggers review -- Review errors should resolve on next poll cycle -- If same PR fails review 3+ times → likely a prompt issue, escalate - -## Lessons Learned -- Review bot must output JSON — prevents self-narration collapse -- DISCUSS verdict should be treated same as REQUEST_CHANGES by dev-agent -- Error comments must NOT include `<!-- reviewed: SHA -->` — would falsely mark as reviewed -- Review bot uses forge formal reviews API — branch protection requires different user than PR author diff --git a/supervisor/journal/.gitkeep b/supervisor/journal/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/supervisor/preflight.sh b/supervisor/preflight.sh index af4e5d4..ba740b7 100755 --- a/supervisor/preflight.sh +++ b/supervisor/preflight.sh @@ -218,7 +218,7 @@ echo "" echo "## Pending Vault Items" _found_vault=false -for _vf in "${PROJECT_REPO_ROOT}/vault/pending/"*.md; do +for _vf in "${OPS_REPO_ROOT}/vault/pending/"*.md; do [ -f "$_vf" ] || continue _found_vault=true _vtitle=$(grep -m1 '^# ' "$_vf" | sed 's/^# //' || basename "$_vf") diff --git a/vault/AGENTS.md b/vault/AGENTS.md index 2cc3fa7..5b010ec 100644 --- a/vault/AGENTS.md +++ b/vault/AGENTS.md @@ -10,11 +10,11 @@ to a human by writing `PHASE:escalate` to a phase file — using the same unified escalation path as dev/action agents. **Pipeline B — Procurement (*.md)**: The planner files resource requests as -markdown files in `vault/pending/`. `vault-poll.sh` notifies the human via +markdown files in `$OPS_REPO_ROOT/vault/pending/`. `vault-poll.sh` notifies the human via vault/forge. The human fulfills the request (creates accounts, provisions infra, -adds secrets to `.env`) and moves the file to `vault/approved/`. +adds secrets to `.env`) and moves the file to `$OPS_REPO_ROOT/vault/approved/`. `vault-fire.sh` then extracts the proposed entry and appends it to -`RESOURCES.md`. +`$OPS_REPO_ROOT/RESOURCES.md`. 
**Pipeline C — Rent-a-Human (outreach drafts)**: Any agent can dispatch the `run-rent-a-human` formula (via an `action` issue) when a task requires a human @@ -30,15 +30,15 @@ needed — the human reviews and publishes directly. - `vault/vault-agent.sh` — Classifies and routes pending JSON actions via `claude -p`: auto-approve, auto-reject, or escalate to human - `vault/vault-env.sh` — Shared env setup for vault sub-scripts: sources `lib/env.sh`, overrides `FORGE_TOKEN` with `FORGE_VAULT_TOKEN`, sets `VAULT_TOKEN` for vault-runner container - `vault/PROMPT.md` — System prompt for the vault agent's Claude invocation -- `vault/vault-fire.sh` — Executes an approved action (JSON) in an **ephemeral Docker container** with vault-only secrets injected (GITHUB_TOKEN, CLAWHUB_TOKEN — never exposed to agents). For deployment actions, calls `lib/ci-helpers.sh:ci_promote()` to gate production promotes via Woodpecker environments. Writes RESOURCES.md entry for procurement MD approvals. +- `vault/vault-fire.sh` — Executes an approved action (JSON) in an **ephemeral Docker container** with vault-only secrets injected (GITHUB_TOKEN, CLAWHUB_TOKEN — never exposed to agents). For deployment actions, calls `lib/ci-helpers.sh:ci_promote()` to gate production promotes via Woodpecker environments. Writes `$OPS_REPO_ROOT/RESOURCES.md` entry for procurement MD approvals. - `vault/vault-reject.sh` — Marks a JSON action as rejected - `formulas/run-rent-a-human.toml` — Formula for human-action drafts: Claude researches target platform norms, drafts copy-paste content, writes to `vault/outreach/{platform}/drafts/`, notifies human via vault/forge -**Procurement flow**: -1. Planner drops `vault/pending/<name>.md` with what/why/proposed RESOURCES.md entry +**Procurement flow** (all vault items live in `$OPS_REPO_ROOT/vault/`): +1. Planner drops `$OPS_REPO_ROOT/vault/pending/<name>.md` with what/why/proposed RESOURCES.md entry 2. `vault-poll.sh` notifies human via vault/forge -3. 
Human fulfills: creates account, adds secrets to `.env`, moves file to `vault/approved/` -4. `vault-fire.sh` extracts proposed entry, appends to RESOURCES.md, moves to `vault/fired/` +3. Human fulfills: creates account, adds secrets to `.env`, moves file to `approved/` +4. `vault-fire.sh` extracts proposed entry, appends to `$OPS_REPO_ROOT/RESOURCES.md`, moves to `fired/` 5. Next planner run reads RESOURCES.md → new capability available → unblocks prerequisite tree **Environment variables consumed**: diff --git a/vault/PROMPT.md b/vault/PROMPT.md index 85dc669..3f93ee5 100644 --- a/vault/PROMPT.md +++ b/vault/PROMPT.md @@ -1,7 +1,7 @@ # Vault Agent You are the vault agent for `$FORGE_REPO`. You were called by -`vault-poll.sh` because one or more actions in `vault/pending/` need +`vault-poll.sh` because one or more actions in `$OPS_REPO_ROOT/vault/pending/` need classification and routing. ## Two Pipelines @@ -16,7 +16,7 @@ You classify and route these: auto-approve, escalate, or reject. Resource requests from the planner. These always escalate to the human — you do NOT auto-approve or reject procurement requests. The human fulfills the request (creates accounts, provisions infra, adds secrets to .env) -and moves the file from `vault/pending/` to `vault/approved/`. +and moves the file from `$OPS_REPO_ROOT/vault/pending/` to `$OPS_REPO_ROOT/vault/approved/`. `vault-fire.sh` then writes the RESOURCES.md entry. ## Your Job (Action Gating only) @@ -116,7 +116,7 @@ ROUTE: <action-id> → <auto-approve|escalate|reject> — <reason> - Process ALL pending JSON actions in the batch. Never skip silently. - For auto-approved actions, fire them immediately via `vault-fire.sh`. -- For escalated actions, move to `vault/approved/` only AFTER human approval. +- For escalated actions, move to `$OPS_REPO_ROOT/vault/approved/` only AFTER human approval. - Read the action JSON carefully. Check the payload, not just the metadata. 
- Ignore `.md` files in pending/ — those are procurement requests handled separately by vault-poll.sh and the human. diff --git a/vault/approved/.gitkeep b/vault/approved/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/vault/fired/.gitkeep b/vault/fired/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/vault/outreach/.gitkeep b/vault/outreach/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/vault/pending/.gitkeep b/vault/pending/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/vault/rejected/.gitkeep b/vault/rejected/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/vault/vault-agent.sh b/vault/vault-agent.sh index 1bda3b9..4436982 100755 --- a/vault/vault-agent.sh +++ b/vault/vault-agent.sh @@ -13,9 +13,10 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" source "${SCRIPT_DIR}/vault-env.sh" -VAULT_DIR="${FACTORY_ROOT}/vault" -PROMPT_FILE="${VAULT_DIR}/PROMPT.md" -LOGFILE="${VAULT_DIR}/vault.log" +VAULT_SCRIPT_DIR="${FACTORY_ROOT}/vault" +OPS_VAULT_DIR="${OPS_REPO_ROOT}/vault" +PROMPT_FILE="${VAULT_SCRIPT_DIR}/PROMPT.md" +LOGFILE="${VAULT_SCRIPT_DIR}/vault.log" CLAUDE_TIMEOUT="${CLAUDE_TIMEOUT:-3600}" log() { @@ -26,7 +27,7 @@ log() { ACTIONS_BATCH="" ACTION_COUNT=0 -for action_file in "${VAULT_DIR}/pending/"*.json; do +for action_file in "${OPS_VAULT_DIR}/pending/"*.json; do [ -f "$action_file" ] || continue ACTION_STATUS=$(jq -r '.status // ""' < "$action_file" 2>/dev/null) @@ -36,7 +37,7 @@ for action_file in "${VAULT_DIR}/pending/"*.json; do if ! 
jq empty < "$action_file" 2>/dev/null; then ACTION_ID=$(basename "$action_file" .json) log "malformed JSON: $action_file — rejecting" - bash "${VAULT_DIR}/vault-reject.sh" "$ACTION_ID" "malformed JSON" 2>/dev/null || true + bash "${VAULT_SCRIPT_DIR}/vault-reject.sh" "$ACTION_ID" "malformed JSON" 2>/dev/null || true continue fi @@ -66,9 +67,10 @@ ${ACTIONS_BATCH} ## Environment - FACTORY_ROOT=${FACTORY_ROOT} -- Vault directory: ${VAULT_DIR} -- vault-fire.sh: bash ${VAULT_DIR}/vault-fire.sh <action-id> -- vault-reject.sh: bash ${VAULT_DIR}/vault-reject.sh <action-id> \"<reason>\" +- OPS_REPO_ROOT=${OPS_REPO_ROOT} +- Vault data: ${OPS_VAULT_DIR} +- vault-fire.sh: bash ${VAULT_SCRIPT_DIR}/vault-fire.sh <action-id> +- vault-reject.sh: bash ${VAULT_SCRIPT_DIR}/vault-reject.sh <action-id> \"<reason>\" Process each action now. For auto-approve, fire immediately. For reject, call vault-reject.sh. diff --git a/vault/vault-fire.sh b/vault/vault-fire.sh index 6388b2e..229825b 100755 --- a/vault/vault-fire.sh +++ b/vault/vault-fire.sh @@ -17,10 +17,10 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" source "${SCRIPT_DIR}/vault-env.sh" -VAULT_DIR="${FACTORY_ROOT}/vault" -LOCKS_DIR="${VAULT_DIR}/.locks" -LOGFILE="${VAULT_DIR}/vault.log" -RESOURCES_FILE="${PROJECT_REPO_ROOT:-${FACTORY_ROOT}}/RESOURCES.md" +OPS_VAULT_DIR="${OPS_REPO_ROOT}/vault" +LOCKS_DIR="${FACTORY_ROOT}/vault/.locks" +LOGFILE="${FACTORY_ROOT}/vault/vault.log" +RESOURCES_FILE="${OPS_REPO_ROOT}/RESOURCES.md" log() { printf '[%s] vault-fire: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" >> "$LOGFILE" @@ -34,19 +34,19 @@ ACTION_ID="${1:?Usage: vault-fire.sh <item-id>}" IS_PROCUREMENT=false ACTION_FILE="" -if [ -f "${VAULT_DIR}/approved/${ACTION_ID}.md" ]; then +if [ -f "${OPS_VAULT_DIR}/approved/${ACTION_ID}.md" ]; then IS_PROCUREMENT=true - ACTION_FILE="${VAULT_DIR}/approved/${ACTION_ID}.md" -elif [ -f "${VAULT_DIR}/pending/${ACTION_ID}.md" ]; then + 
ACTION_FILE="${OPS_VAULT_DIR}/approved/${ACTION_ID}.md" +elif [ -f "${OPS_VAULT_DIR}/pending/${ACTION_ID}.md" ]; then IS_PROCUREMENT=true - mv "${VAULT_DIR}/pending/${ACTION_ID}.md" "${VAULT_DIR}/approved/${ACTION_ID}.md" - ACTION_FILE="${VAULT_DIR}/approved/${ACTION_ID}.md" + mv "${OPS_VAULT_DIR}/pending/${ACTION_ID}.md" "${OPS_VAULT_DIR}/approved/${ACTION_ID}.md" + ACTION_FILE="${OPS_VAULT_DIR}/approved/${ACTION_ID}.md" log "$ACTION_ID: pending → approved (procurement)" -elif [ -f "${VAULT_DIR}/approved/${ACTION_ID}.json" ]; then - ACTION_FILE="${VAULT_DIR}/approved/${ACTION_ID}.json" -elif [ -f "${VAULT_DIR}/pending/${ACTION_ID}.json" ]; then - mv "${VAULT_DIR}/pending/${ACTION_ID}.json" "${VAULT_DIR}/approved/${ACTION_ID}.json" - ACTION_FILE="${VAULT_DIR}/approved/${ACTION_ID}.json" +elif [ -f "${OPS_VAULT_DIR}/approved/${ACTION_ID}.json" ]; then + ACTION_FILE="${OPS_VAULT_DIR}/approved/${ACTION_ID}.json" +elif [ -f "${OPS_VAULT_DIR}/pending/${ACTION_ID}.json" ]; then + mv "${OPS_VAULT_DIR}/pending/${ACTION_ID}.json" "${OPS_VAULT_DIR}/approved/${ACTION_ID}.json" + ACTION_FILE="${OPS_VAULT_DIR}/approved/${ACTION_ID}.json" TMP=$(mktemp) jq '.status = "approved"' "$ACTION_FILE" > "$TMP" && mv "$TMP" "$ACTION_FILE" log "$ACTION_ID: pending → approved" @@ -93,7 +93,7 @@ if [ "$IS_PROCUREMENT" = true ]; then log "$ACTION_ID: wrote RESOURCES.md entry" # Move to fired/ - mv "$ACTION_FILE" "${VAULT_DIR}/fired/${ACTION_ID}.md" + mv "$ACTION_FILE" "${OPS_VAULT_DIR}/fired/${ACTION_ID}.md" rm -f "${LOCKS_DIR}/${ACTION_ID}.notified" log "$ACTION_ID: approved → fired (procurement)" exit 0 @@ -122,7 +122,7 @@ if [ -f "${FACTORY_ROOT}/.env.vault.enc" ] && [ -f "${FACTORY_ROOT}/docker-compo else # Fallback for bare-metal or pre-migration setups: run action handler directly log "$ACTION_ID: no .env.vault.enc or docker-compose.yml — running action directly" - bash "${VAULT_DIR}/vault-run-action.sh" "$ACTION_ID" >> "$LOGFILE" 2>&1 || FIRE_EXIT=$? 
+ bash "${SCRIPT_DIR}/vault-run-action.sh" "$ACTION_ID" >> "$LOGFILE" 2>&1 || FIRE_EXIT=$? fi # ============================================================================= @@ -132,7 +132,7 @@ if [ "$FIRE_EXIT" -eq 0 ]; then # Update with fired timestamp and move to fired/ TMP=$(mktemp) jq --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" '.status = "fired" | .fired_at = $ts' "$ACTION_FILE" > "$TMP" \ - && mv "$TMP" "${VAULT_DIR}/fired/${ACTION_ID}.json" + && mv "$TMP" "${OPS_VAULT_DIR}/fired/${ACTION_ID}.json" rm -f "$ACTION_FILE" log "$ACTION_ID: approved → fired" else diff --git a/vault/vault-poll.sh b/vault/vault-poll.sh index 288c76b..ace8984 100755 --- a/vault/vault-poll.sh +++ b/vault/vault-poll.sh @@ -26,8 +26,9 @@ FORGE_TOKEN="${FORGE_VAULT_TOKEN:-${FORGE_TOKEN}}" LOGFILE="${FACTORY_ROOT}/vault/vault.log" STATUSFILE="/tmp/vault-status" LOCKFILE="/tmp/vault-poll.lock" -VAULT_DIR="${FACTORY_ROOT}/vault" -LOCKS_DIR="${VAULT_DIR}/.locks" +VAULT_SCRIPT_DIR="${FACTORY_ROOT}/vault" +OPS_VAULT_DIR="${OPS_REPO_ROOT}/vault" +LOCKS_DIR="${VAULT_SCRIPT_DIR}/.locks" TIMEOUT_HOURS=48 @@ -78,7 +79,7 @@ unlock_action() { # ============================================================================= status "phase 1: retrying approved items" -for action_file in "${VAULT_DIR}/approved/"*.json; do +for action_file in "${OPS_VAULT_DIR}/approved/"*.json; do [ -f "$action_file" ] || continue ACTION_ID=$(jq -r '.id // ""' < "$action_file" 2>/dev/null) [ -z "$ACTION_ID" ] && continue @@ -89,7 +90,7 @@ for action_file in "${VAULT_DIR}/approved/"*.json; do fi log "retrying approved action: $ACTION_ID" - if bash "${VAULT_DIR}/vault-fire.sh" "$ACTION_ID" >> "$LOGFILE" 2>&1; then + if bash "${VAULT_SCRIPT_DIR}/vault-fire.sh" "$ACTION_ID" >> "$LOGFILE" 2>&1; then log "fired $ACTION_ID (retry)" else log "ERROR: fire failed for $ACTION_ID (retry)" @@ -99,7 +100,7 @@ for action_file in "${VAULT_DIR}/approved/"*.json; do done # Retry approved procurement requests (.md) -for req_file in 
"${VAULT_DIR}/approved/"*.md; do +for req_file in "${OPS_VAULT_DIR}/approved/"*.md; do [ -f "$req_file" ] || continue REQ_ID=$(basename "$req_file" .md) @@ -109,7 +110,7 @@ for req_file in "${VAULT_DIR}/approved/"*.md; do fi log "retrying approved procurement: $REQ_ID" - if bash "${VAULT_DIR}/vault-fire.sh" "$REQ_ID" >> "$LOGFILE" 2>&1; then + if bash "${VAULT_SCRIPT_DIR}/vault-fire.sh" "$REQ_ID" >> "$LOGFILE" 2>&1; then log "fired procurement $REQ_ID (retry)" else log "ERROR: fire failed for procurement $REQ_ID (retry)" @@ -126,7 +127,7 @@ status "phase 2: checking escalation timeouts" NOW_EPOCH=$(date +%s) TIMEOUT_SECS=$((TIMEOUT_HOURS * 3600)) -for action_file in "${VAULT_DIR}/pending/"*.json; do +for action_file in "${OPS_VAULT_DIR}/pending/"*.json; do [ -f "$action_file" ] || continue ACTION_STATUS=$(jq -r '.status // ""' < "$action_file" 2>/dev/null) @@ -142,7 +143,7 @@ for action_file in "${VAULT_DIR}/pending/"*.json; do if [ "$AGE_SECS" -gt "$TIMEOUT_SECS" ]; then AGE_HOURS=$((AGE_SECS / 3600)) log "timeout: $ACTION_ID escalated ${AGE_HOURS}h ago with no reply — auto-rejecting" - bash "${VAULT_DIR}/vault-reject.sh" "$ACTION_ID" "timeout (${AGE_HOURS}h, no human reply)" >> "$LOGFILE" 2>&1 || true + bash "${VAULT_SCRIPT_DIR}/vault-reject.sh" "$ACTION_ID" "timeout (${AGE_HOURS}h, no human reply)" >> "$LOGFILE" 2>&1 || true fi done @@ -154,7 +155,7 @@ status "phase 3: processing pending actions" PENDING_COUNT=0 PENDING_SUMMARY="" -for action_file in "${VAULT_DIR}/pending/"*.json; do +for action_file in "${OPS_VAULT_DIR}/pending/"*.json; do [ -f "$action_file" ] || continue ACTION_STATUS=$(jq -r '.status // ""' < "$action_file" 2>/dev/null) @@ -181,7 +182,7 @@ if [ "$PENDING_COUNT" -gt 0 ]; then log "found $PENDING_COUNT pending action(s), invoking vault-agent" status "invoking vault-agent for $PENDING_COUNT action(s)" - bash "${VAULT_DIR}/vault-agent.sh" >> "$LOGFILE" 2>&1 || { + bash "${VAULT_SCRIPT_DIR}/vault-agent.sh" >> "$LOGFILE" 2>&1 || { log "ERROR: 
vault-agent failed" } fi @@ -193,12 +194,12 @@ status "phase 4: processing pending procurement requests" PROCURE_COUNT=0 -for req_file in "${VAULT_DIR}/pending/"*.md; do +for req_file in "${OPS_VAULT_DIR}/pending/"*.md; do [ -f "$req_file" ] || continue REQ_ID=$(basename "$req_file" .md) # Check if already notified (marker file) - if [ -f "${VAULT_DIR}/.locks/${REQ_ID}.notified" ]; then + if [ -f "${LOCKS_DIR}/${REQ_ID}.notified" ]; then continue fi @@ -215,8 +216,8 @@ for req_file in "${VAULT_DIR}/pending/"*.md; do log "new procurement request: $REQ_ID — $REQ_TITLE" # Mark as notified so we don't re-send - mkdir -p "${VAULT_DIR}/.locks" - touch "${VAULT_DIR}/.locks/${REQ_ID}.notified" + mkdir -p "${LOCKS_DIR}" + touch "${LOCKS_DIR}/${REQ_ID}.notified" unlock_action "$REQ_ID" done @@ -239,7 +240,7 @@ if [ -n "${FORGE_REPO:-}" ] && [ -n "${FORGE_TOKEN:-}" ]; then ISSUE_NUM=$(printf '%s' "$ACTION_ISSUES" | jq -r ".[$idx].number") # Skip if already processed - if [ -f "${VAULT_DIR}/.locks/issue-${ISSUE_NUM}.vault-fired" ]; then + if [ -f "${LOCKS_DIR}/issue-${ISSUE_NUM}.vault-fired" ]; then continue fi @@ -272,21 +273,21 @@ if [ -n "${FORGE_REPO:-}" ] && [ -n "${FORGE_TOKEN:-}" ]; then fi # Skip if this action already exists in any stage - if [ -f "${VAULT_DIR}/approved/${ACTION_ID}.json" ] || \ - [ -f "${VAULT_DIR}/fired/${ACTION_ID}.json" ] || \ - [ -f "${VAULT_DIR}/rejected/${ACTION_ID}.json" ]; then + if [ -f "${OPS_VAULT_DIR}/approved/${ACTION_ID}.json" ] || \ + [ -f "${OPS_VAULT_DIR}/fired/${ACTION_ID}.json" ] || \ + [ -f "${OPS_VAULT_DIR}/rejected/${ACTION_ID}.json" ]; then continue fi log "vault-bot authorized action on issue #${ISSUE_NUM}: ${ACTION_ID}" - printf '%s' "$ACTION_JSON" | jq '.status = "approved"' > "${VAULT_DIR}/approved/${ACTION_ID}.json" + printf '%s' "$ACTION_JSON" | jq '.status = "approved"' > "${OPS_VAULT_DIR}/approved/${ACTION_ID}.json" COMMENT_COUNT=$((COMMENT_COUNT + 1)) # Fire the action - if bash "${VAULT_DIR}/vault-fire.sh" 
"$ACTION_ID" >> "$LOGFILE" 2>&1; then + if bash "${VAULT_SCRIPT_DIR}/vault-fire.sh" "$ACTION_ID" >> "$LOGFILE" 2>&1; then log "fired ${ACTION_ID} from issue #${ISSUE_NUM}" # Mark issue as processed - touch "${VAULT_DIR}/.locks/issue-${ISSUE_NUM}.vault-fired" + touch "${LOCKS_DIR}/issue-${ISSUE_NUM}.vault-fired" else log "ERROR: fire failed for ${ACTION_ID} from issue #${ISSUE_NUM}" fi diff --git a/vault/vault-reject.sh b/vault/vault-reject.sh index 9699753..7339604 100755 --- a/vault/vault-reject.sh +++ b/vault/vault-reject.sh @@ -8,8 +8,9 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" source "${SCRIPT_DIR}/vault-env.sh" -VAULT_DIR="${FACTORY_ROOT}/vault" -LOGFILE="${VAULT_DIR}/vault.log" +OPS_VAULT_DIR="${OPS_REPO_ROOT}/vault" +LOGFILE="${FACTORY_ROOT}/vault/vault.log" +LOCKS_DIR="${FACTORY_ROOT}/vault/.locks" log() { printf '[%s] vault-reject: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" >> "$LOGFILE" @@ -20,10 +21,10 @@ REASON="${2:-unspecified}" # Find the action file ACTION_FILE="" -if [ -f "${VAULT_DIR}/pending/${ACTION_ID}.json" ]; then - ACTION_FILE="${VAULT_DIR}/pending/${ACTION_ID}.json" -elif [ -f "${VAULT_DIR}/approved/${ACTION_ID}.json" ]; then - ACTION_FILE="${VAULT_DIR}/approved/${ACTION_ID}.json" +if [ -f "${OPS_VAULT_DIR}/pending/${ACTION_ID}.json" ]; then + ACTION_FILE="${OPS_VAULT_DIR}/pending/${ACTION_ID}.json" +elif [ -f "${OPS_VAULT_DIR}/approved/${ACTION_ID}.json" ]; then + ACTION_FILE="${OPS_VAULT_DIR}/approved/${ACTION_ID}.json" else log "ERROR: action $ACTION_ID not found in pending/ or approved/" exit 1 @@ -33,10 +34,10 @@ fi TMP=$(mktemp) jq --arg reason "$REASON" --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ '.status = "rejected" | .rejected_at = $ts | .reject_reason = $reason' \ - "$ACTION_FILE" > "$TMP" && mv "$TMP" "${VAULT_DIR}/rejected/${ACTION_ID}.json" + "$ACTION_FILE" > "$TMP" && mv "$TMP" "${OPS_VAULT_DIR}/rejected/${ACTION_ID}.json" rm -f "$ACTION_FILE" # Clean up lock if present -rm -f 
"${VAULT_DIR}/.locks/${ACTION_ID}.lock" +rm -f "${LOCKS_DIR}/${ACTION_ID}.lock" log "$ACTION_ID: rejected — $REASON" diff --git a/vault/vault-run-action.sh b/vault/vault-run-action.sh index 169af65..707f3db 100755 --- a/vault/vault-run-action.sh +++ b/vault/vault-run-action.sh @@ -12,8 +12,9 @@ set -euo pipefail -VAULT_DIR="${DISINTO_VAULT_DIR:-/home/agent/disinto/vault}" -LOGFILE="${VAULT_DIR}/vault.log" +VAULT_SCRIPT_DIR="${DISINTO_VAULT_DIR:-/home/agent/disinto/vault}" +OPS_VAULT_DIR="${DISINTO_OPS_VAULT_DIR:-${VAULT_SCRIPT_DIR}}" +LOGFILE="${VAULT_SCRIPT_DIR}/vault.log" ACTION_ID="${1:?Usage: vault-run-action.sh <action-id>}" log() { @@ -22,7 +23,7 @@ log() { } # Find action file in approved/ -ACTION_FILE="${VAULT_DIR}/approved/${ACTION_ID}.json" +ACTION_FILE="${OPS_VAULT_DIR}/approved/${ACTION_ID}.json" if [ ! -f "$ACTION_FILE" ]; then log "ERROR: action file not found: ${ACTION_FILE}" echo "ERROR: action file not found: ${ACTION_FILE}" >&2 @@ -118,7 +119,7 @@ case "$ACTION_TYPE" in ;; blog-post|social-post|email-blast|pricing-change|dns-change|stripe-charge) - HANDLER="${VAULT_DIR}/handlers/${ACTION_TYPE}.sh" + HANDLER="${VAULT_SCRIPT_DIR}/handlers/${ACTION_TYPE}.sh" if [ -x "$HANDLER" ]; then bash "$HANDLER" "$ACTION_ID" "$PAYLOAD" 2>&1 || FIRE_EXIT=$? else