Compare commits

..

No commits in common. "main" and "chore/gardener-20260421-1208" have entirely different histories.

39 changed files with 170 additions and 1063 deletions

View file

@ -4,16 +4,10 @@
# Steps:
# 1. shellcheck — lint all .sh files (warnings+errors)
# 2. duplicate-detection — report copy-pasted code blocks (fails only on new duplicates for PRs)
#
# Timeouts:
# Workflow-level default (10m) applies to all steps unless overridden.
when:
event: [push, pull_request]
# Workflow-level timeout default — propagates to all steps without their own timeout.
timeout: 10m
# Override default clone to authenticate against Forgejo using FORGE_TOKEN.
# Required because Forgejo is configured with REQUIRE_SIGN_IN, so anonymous
# git clones fail with exit code 128. FORGE_TOKEN is injected globally via
@ -22,13 +16,8 @@ clone:
git:
image: alpine/git
commands:
- |
if [ -n "${FORGE_TOKEN:-}" ]; then
AUTH_URL=$(printf '%s' "$CI_REPO_CLONE_URL" | sed "s|://|://token:${FORGE_TOKEN}@|")
git clone --depth 1 "$AUTH_URL" .
else
git clone --depth 1 "$CI_REPO_CLONE_URL" .
fi
- AUTH_URL=$(printf '%s' "$CI_REPO_CLONE_URL" | sed "s|://|://token:$FORGE_TOKEN@|")
- git clone --depth 1 "$AUTH_URL" .
- git fetch --depth 1 origin "$CI_COMMIT_REF"
- git checkout FETCH_HEAD

View file

@ -7,15 +7,13 @@
#
# Checks:
# 1. shellcheck — syntax check on tests/smoke-edge-subpath.sh
# 2. caddyfile-routing-test — verify Caddyfile routing block shape
# 3. test-caddyfile-routing — run standalone unit test for Caddyfile structure
# 2. caddy validate — validate the Caddyfile template syntax
# 3. caddyfile-routing-test — verify Caddyfile routing block shape
# 4. test-caddyfile-routing — run standalone unit test for Caddyfile structure
#
# Triggers:
# - Pull requests that modify edge-related files
#
# Timeouts:
# Workflow-level default (10m) applies to all steps unless overridden.
#
# Environment variables (inherited from WOODPECKER_ENVIRONMENT):
# EDGE_BASE_URL — Edge proxy URL for reference (default: http://localhost)
# EDGE_TIMEOUT — Request timeout in seconds (default: 30)
@ -25,9 +23,6 @@
when:
event: pull_request
# Workflow-level timeout default — propagates to all steps without their own timeout.
timeout: 10m
steps:
# ── 1. ShellCheck on smoke script ────────────────────────────────────────
# `shellcheck` validates bash syntax, style, and common pitfalls.
@ -64,7 +59,6 @@ steps:
echo ''
echo ' # Reverse proxy to Forgejo'
echo ' handle /forge/* {'
echo ' uri strip_prefix /forge'
echo ' reverse_proxy 127.0.0.1:3000'
echo ' }'
echo ''
@ -100,7 +94,22 @@ steps:
cp edge-render/Caddyfile edge-render/Caddyfile.rendered
echo "Caddyfile rendered successfully"
# ── 3. Caddyfile routing block shape test ─────────────────────────────────
# ── 3. Caddy config validation ───────────────────────────────────────────
# `caddy validate` checks Caddyfile syntax and configuration.
# This validates the rendered Caddyfile against Caddy's parser.
# Exit codes:
# 0 — configuration is valid
# 1 — configuration has errors
- name: caddy-validate
image: alpine:3.19
commands:
- apk add --no-cache ca-certificates curl
- curl -sS -o /tmp/caddy "https://caddyserver.com/api/download?os=linux&arch=amd64"
- chmod +x /tmp/caddy
- /tmp/caddy version
- /tmp/caddy validate --config edge-render/Caddyfile.rendered --adapter caddyfile
# ── 4. Caddyfile routing block shape test ─────────────────────────────────
# Verify that the Caddyfile contains all required routing blocks:
# - /forge/ — Forgejo subpath
# - /ci/ — Woodpecker subpath
@ -181,7 +190,7 @@ steps:
exit 1
fi
# ── 4. Standalone Caddyfile routing test ─────────────────────────────────
# ── 5. Standalone Caddyfile routing test ─────────────────────────────────
# Run the standalone unit test for Caddyfile routing block validation.
# This test extracts the Caddyfile template from edge.hcl and validates
# its structure without requiring a running Caddy instance.

View file

@ -1,34 +0,0 @@
# .woodpecker/lint-ci.yml — CI pipeline config validator
#
# Runs `disinto validate lint-ci` to check all .woodpecker/*.yml files for:
# - Steps missing a timeout declaration
# - Network-fetch commands without per-command timeouts
#
# Triggers on PRs/pushes that touch any CI config or the validator itself.
when:
- event: [push, pull_request]
path:
- ".woodpecker/**"
- "bin/disinto"
# Workflow-level timeout default — propagates to all steps without their own timeout.
timeout: 5m
# Authenticated clone — same pattern as .woodpecker/ci.yml.
clone:
git:
image: alpine/git
commands:
- AUTH_URL=$(printf '%s' "$CI_REPO_CLONE_URL" | sed "s|://|://token:$FORGE_TOKEN@|")
- git clone --depth 1 "$AUTH_URL" .
- git fetch --depth 1 origin "$CI_COMMIT_REF"
- git checkout FETCH_HEAD
steps:
- name: lint-ci
image: alpine:3
commands:
- apk add --no-cache bash python3 py3-yaml
- bash bin/disinto validate lint-ci .
# Workflow-level timeout (10m) applies to all steps.

View file

@ -44,10 +44,6 @@
# Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 /
# vault 1.18.5). Bump there AND here together — drift = CI passing on
# syntax the runtime would reject.
#
# Timeouts:
# Workflow-level default (15m) applies to all steps unless overridden
# (vault-policy-validate needs longer for dev server startup).
# =============================================================================
when:
@ -61,9 +57,6 @@ when:
- "vault/roles.yaml"
- ".woodpecker/nomad-validate.yml"
# Workflow-level timeout default — propagates to all steps without their own timeout.
timeout: 15m
# Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is
# configured with REQUIRE_SIGN_IN, so anonymous git clones fail (exit 128).
# FORGE_TOKEN is injected globally via WOODPECKER_ENVIRONMENT.
@ -272,7 +265,7 @@ steps:
- name: vault-roles-validate
image: python:3.12-alpine
commands:
- pip install --quiet --disable-pip-version-check --default-timeout 30 pyyaml yamllint
- pip install --quiet --disable-pip-version-check pyyaml yamllint
- |
set -e
if [ ! -f vault/roles.yaml ]; then

View file

@ -4,10 +4,6 @@
# - ghcr.io/disinto/reproduce:<tag>
# - ghcr.io/disinto/edge:<tag>
#
# Timeouts:
# Workflow-level default (20m) applies to all steps unless overridden.
# Image builds can be slow for large images.
#
# Requires GHCR_TOKEN secret configured in Woodpecker with push access
# to ghcr.io/disinto.
@ -15,9 +11,6 @@ when:
event: tag
ref: refs/tags/v*
# Workflow-level timeout default — propagates to all steps without their own timeout.
timeout: 20m
clone:
git:
image: alpine/git

View file

@ -3,9 +3,6 @@
# Triggers on pull requests touching secret-adjacent paths.
# Sources lib/secret-scan.sh and scans each changed file's content.
# Exits non-zero if any potential secret is detected.
#
# Timeouts:
# Workflow-level default (5m) applies to all steps unless overridden.
when:
- event: pull_request
@ -18,9 +15,6 @@ when:
- "lib/hvault.sh"
- "lib/action-vault.sh"
# Workflow-level timeout default — propagates to all steps without their own timeout.
timeout: 5m
clone:
git:
image: alpine/git

View file

@ -8,9 +8,6 @@ when:
- "tests/**"
- ".woodpecker/smoke-init.yml"
# Workflow-level timeout default — propagates to all steps without their own timeout.
timeout: 5m
steps:
- name: smoke-init
image: python:3-alpine

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: 5be020b9de1a719cb331b930cf45caf7559473f7 -->
<!-- last-reviewed: 0d6181918452c1407a3f6bc62917724261acff26 -->
# Disinto — Agent Instructions
## What this repo is

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: 5be020b9de1a719cb331b930cf45caf7559473f7 -->
<!-- last-reviewed: 0d6181918452c1407a3f6bc62917724261acff26 -->
# Architect — Agent Instructions
## What this agent is

View file

@ -119,11 +119,6 @@ You are the architect agent for ${FORGE_REPO}. Work through the formula below.
Your role: strategic decomposition of vision issues into development sprints.
Propose sprints via PRs on the ops repo, converse with humans through PR comments.
You are READ-ONLY on the project repo — sub-issues are filed by filer-bot after sprint PR merge (#764).
DO NOT create issues, PRs, or any other resource on the project repo. Any sub-issue
specification must go only into the filer:begin/filer:end block of the sprint pitch.
If you think sub-issues should be filed, write them into the sprint file's filer:begin
block only. You do not have permission to POST to the project repo and any such call
will return 403 and fail this run.
## Project context
${CONTEXT_BLOCK}
@ -152,11 +147,6 @@ You are the architect agent for ${FORGE_REPO}. Work through the formula below.
Your role: strategic decomposition of vision issues into development sprints.
Propose sprints via PRs on the ops repo, converse with humans through PR comments.
You are READ-ONLY on the project repo — sub-issues are filed by filer-bot after sprint PR merge (#764).
DO NOT create issues, PRs, or any other resource on the project repo. Any sub-issue
specification must go only into the filer:begin/filer:end block of the sprint pitch.
If you think sub-issues should be filed, write them into the sprint file's filer:begin
block only. You do not have permission to POST to the project repo and any such call
will return 403 and fail this run.
## CURRENT STATE: Approved PR awaiting initial design questions
@ -191,11 +181,6 @@ You are the architect agent for ${FORGE_REPO}. Work through the formula below.
Your role: strategic decomposition of vision issues into development sprints.
Propose sprints via PRs on the ops repo, converse with humans through PR comments.
You are READ-ONLY on the project repo — sub-issues are filed by filer-bot after sprint PR merge (#764).
DO NOT create issues, PRs, or any other resource on the project repo. Any sub-issue
specification must go only into the filer:begin/filer:end block of the sprint pitch.
If you think sub-issues should be filed, write them into the sprint file's filer:begin
block only. You do not have permission to POST to the project repo and any such call
will return 403 and fail this run.
## CURRENT STATE: Design Q&A in progress
@ -546,11 +531,6 @@ IMPORTANT: Do NOT include design forks or questions. This is a go/no-go pitch.
The ## Sub-issues block is parsed by the filer-bot pipeline after sprint PR merge.
Each sub-issue between filer:begin/end markers becomes a Forgejo issue.
CRITICAL: You are READ-ONLY on the project repo. DO NOT create issues, PRs, or
POST to any /repos/${FORGE_REPO}/... endpoint. Sub-issues belong only inside the
filer:begin/filer:end block above. Any direct API call to the project repo will
return 403 and abort this run.
---
${pitch_context}
@ -917,27 +897,6 @@ if [ "${has_responses_to_process:-false}" = "true" ]; then
fi
fi
# ── Regression guard: detect direct issue creation by architect session ──
# Scans the architect log for any POST to the project repo's /issues endpoint.
# This is a cheap guard — if the model used its Bash tool to curl POST /issues
# on the project repo, it would appear in the log. Fails loudly on detection.
check_architect_issue_filing() {
local project_repo_path
project_repo_path="/repos/${FORGE_REPO}/issues"
if grep -q "POST.*${project_repo_path}" "$LOG_FILE" 2>/dev/null; then
log "ERROR: regression detected — architect session attempted to POST to ${project_repo_path}"
log "This violates the read-only contract established in #764."
log "The architect-bot must NOT file issues directly on the project repo."
log "Sub-issues are filed exclusively by filer-bot after sprint PR merge."
echo "FATAL: architect-bot attempted direct issue creation on project repo" >&2
exit 1
fi
}
# Run regression guard before cleanup
check_architect_issue_filing
# ── Clean up scratch files (legacy single file + per-issue files) ──────────
rm -f "$SCRATCH_FILE"
rm -f "${SCRATCH_FILE_PREFIX}"-*.md

View file

@ -13,7 +13,6 @@
# disinto run <action-id> Run action in ephemeral runner container
# disinto ci-logs <pipeline> [--step <name>] Read CI logs from Woodpecker SQLite
# disinto backup create <outfile> Export factory state for migration
# disinto validate [subcommand] Validate factory artifacts (lint-ci)
#
# Usage:
# disinto init https://github.com/user/repo
@ -65,26 +64,22 @@ Usage:
disinto release <version> Create vault PR for release (e.g., v1.2.0)
disinto hire-an-agent <agent-name> <role> [--formula <path>] [--local-model <url>] [--model <name>]
Hire a new agent (create user + .profile repo; re-run to rotate credentials)
disinto role <subcommand> Manage roles (enable/disable)
disinto agent <subcommand> Manage agent state (enable/disable)
disinto backup create <outfile> Export factory state (issues + ops bundle)
disinto edge <verb> [options] Manage edge tunnel registrations
disinto backup <subcommand> Backup and restore factory state
disinto validate <subcommand> Validate factory artifacts
Validate subcommands:
lint-ci Lint .woodpecker/*.yml for missing timeouts
Edge subcommands:
register [project] Register a new tunnel (generates keypair if needed)
deregister <project> Remove a tunnel registration
status Show registered tunnels
Role subcommands:
disable <role> Remove state file to disable role
enable <role> Create state file to enable role
disable --all Disable all roles
enable --all Enable all roles
status Show which roles are enabled/disabled
Agent subcommands:
disable <agent> Remove state file to disable agent
enable <agent> Create state file to enable agent
disable --all Disable all agents
enable --all Enable all agents
status Show which agents are enabled/disabled
Init options:
--branch <name> Primary branch (default: auto-detect)
@ -863,15 +858,18 @@ _disinto_init_nomad() {
echo "[deploy] vault-runner: jobspec not found, skipping"
fi
# Build custom images dry-run (if agents or edge services are included)
if echo ",$with_services," | grep -qE ",(agents|edge),"; then
# Build custom images dry-run (if agents, chat, or edge services are included)
if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then
echo ""
echo "── Build images dry-run ──────────────────────────────"
if echo ",$with_services," | grep -q ",agents,"; then
echo "[build] [dry-run] docker build -t disinto/agents:local -f ${FACTORY_ROOT}/docker/agents/Dockerfile ${FACTORY_ROOT}"
fi
if echo ",$with_services," | grep -q ",chat,"; then
echo "[build] [dry-run] docker build -t disinto/chat:local -f ${FACTORY_ROOT}/docker/chat/Dockerfile ${FACTORY_ROOT}/docker/chat"
fi
if echo ",$with_services," | grep -q ",edge,"; then
echo "[build] [dry-run] docker build -t disinto/edge:local -f ${FACTORY_ROOT}/docker/edge/Dockerfile ${FACTORY_ROOT}"
echo "[build] [dry-run] docker build -t disinto/edge:local -f ${FACTORY_ROOT}/docker/edge/Dockerfile ${FACTORY_ROOT}/docker/edge"
fi
fi
exit 0
@ -964,7 +962,7 @@ _disinto_init_nomad() {
# Build custom images required by Nomad jobs (S4.2, S5.2, S5.5) — before deploy.
# Single-node factory dev box: no multi-node pull needed, no registry auth.
# Can upgrade to approach B (registry push/pull) later if multi-node.
if echo ",$with_services," | grep -qE ",(agents|edge),"; then
if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then
echo ""
echo "── Building custom images ─────────────────────────────"
if echo ",$with_services," | grep -q ",agents,"; then
@ -972,10 +970,15 @@ _disinto_init_nomad() {
echo "── Building $tag ─────────────────────────────"
docker build -t "$tag" -f "${FACTORY_ROOT}/docker/agents/Dockerfile" "${FACTORY_ROOT}" 2>&1 | tail -5
fi
if echo ",$with_services," | grep -q ",chat,"; then
local tag="disinto/chat:local"
echo "── Building $tag ─────────────────────────────"
docker build -t "$tag" -f "${FACTORY_ROOT}/docker/chat/Dockerfile" "${FACTORY_ROOT}/docker/chat" 2>&1 | tail -5
fi
if echo ",$with_services," | grep -q ",edge,"; then
local tag="disinto/edge:local"
echo "── Building $tag ─────────────────────────────"
docker build -t "$tag" -f "${FACTORY_ROOT}/docker/edge/Dockerfile" "${FACTORY_ROOT}" 2>&1 | tail -5
docker build -t "$tag" -f "${FACTORY_ROOT}/docker/edge/Dockerfile" "${FACTORY_ROOT}/docker/edge" 2>&1 | tail -5
fi
fi
@ -2594,15 +2597,15 @@ disinto_ci_logs() {
fi
}
# ── role command ─────────────────────────────────────────────────────────────
# Manage role state files (enable/disable roles)
# Usage: disinto role <subcommand> [role-name]
# disable <role> Remove state file to disable role
# enable <role> Create state file to enable role
# disable --all Disable all roles
# enable --all Enable all roles
# status Show enabled/disabled roles
disinto_role() {
# ── agent command ─────────────────────────────────────────────────────────────
# Manage agent state files (enable/disable agents)
# Usage: disinto agent <subcommand> [agent-name]
# disable <agent> Remove state file to disable agent
# enable <agent> Create state file to enable agent
# disable --all Disable all agents
# enable --all Enable all agents
# status Show enabled/disabled agents
disinto_agent() {
local subcmd="${1:-}"
local state_dir="${FACTORY_ROOT}/state"
local all_agents=("dev" "reviewer" "gardener" "architect" "planner" "predictor")
@ -2614,13 +2617,13 @@ disinto_role() {
disable)
local agent="${2:-}"
if [ -z "$agent" ]; then
echo "Error: role name required" >&2
echo "Usage: disinto role disable <role-name>" >&2
echo " disinto role disable --all" >&2
echo "Error: agent name required" >&2
echo "Usage: disinto agent disable <agent-name>" >&2
echo " disinto agent disable --all" >&2
exit 1
fi
if [ "$agent" = "--all" ]; then
echo "Disabling all roles..."
echo "Disabling all agents..."
for a in "${all_agents[@]}"; do
local state_file="${state_dir}/.${a}-active"
if [ -f "$state_file" ]; then
@ -2640,8 +2643,8 @@ disinto_role() {
fi
done
if [ "$valid" = false ]; then
echo "Error: unknown role '${agent}'" >&2
echo "Valid roles: ${all_agents[*]}" >&2
echo "Error: unknown agent '${agent}'" >&2
echo "Valid agents: ${all_agents[*]}" >&2
exit 1
fi
local state_file="${state_dir}/.${agent}-active"
@ -2656,13 +2659,13 @@ disinto_role() {
enable)
local agent="${2:-}"
if [ -z "$agent" ]; then
echo "Error: role name required" >&2
echo "Usage: disinto role enable <role-name>" >&2
echo " disinto role enable --all" >&2
echo "Error: agent name required" >&2
echo "Usage: disinto agent enable <agent-name>" >&2
echo " disinto agent enable --all" >&2
exit 1
fi
if [ "$agent" = "--all" ]; then
echo "Enabling all roles..."
echo "Enabling all agents..."
for a in "${all_agents[@]}"; do
local state_file="${state_dir}/.${a}-active"
if [ -f "$state_file" ]; then
@ -2682,8 +2685,8 @@ disinto_role() {
fi
done
if [ "$valid" = false ]; then
echo "Error: unknown role '${agent}'" >&2
echo "Valid roles: ${all_agents[*]}" >&2
echo "Error: unknown agent '${agent}'" >&2
echo "Valid agents: ${all_agents[*]}" >&2
exit 1
fi
local state_file="${state_dir}/.${agent}-active"
@ -2696,10 +2699,10 @@ disinto_role() {
fi
;;
status)
echo "Role Status"
echo "=========="
printf "%-12s %s\n" "ROLE" "STATUS"
printf "%-12s %s\n" "----" "------"
echo "Agent Status"
echo "============"
printf "%-12s %s\n" "AGENT" "STATUS"
printf "%-12s %s\n" "------" "------"
for a in "${all_agents[@]}"; do
local state_file="${state_dir}/.${a}-active"
local status
@ -2713,29 +2716,23 @@ disinto_role() {
;;
*)
cat <<EOF >&2
Usage: disinto role <subcommand>
Usage: disinto agent <subcommand>
Manage roles (enable/disable):
Manage agent state files (enable/disable agents):
disable <role> Remove state file to disable role
enable <role> Create state file to enable role
disable --all Disable all roles
enable --all Enable all roles
status Show which roles are enabled/disabled
disable <agent> Remove state file to disable agent
enable <agent> Create state file to enable agent
disable --all Disable all agents
enable --all Enable all agents
status Show which agents are enabled/disabled
Valid roles: dev, reviewer, gardener, architect, planner, predictor
Valid agents: dev, reviewer, gardener, architect, planner, predictor
EOF
exit 1
;;
esac
}
# ── agent command (deprecated — use 'role') ──────────────────────────────────
disinto_agent() {
echo "Warning: 'disinto agent' is deprecated, use 'disinto role' instead" >&2
disinto_role "$@"
}
# ── edge command ──────────────────────────────────────────────────────────────
# Manage edge tunnel registrations (reverse SSH tunnels to edge hosts)
# Usage: disinto edge <verb> [options]
@ -3014,214 +3011,6 @@ disinto_backup() {
esac
}
# ── validate command ─────────────────────────────────────────────────────────
# Validates CI pipeline configs and other factory artifacts.
# Usage: disinto validate [subcommand]
# lint-ci Lint .woodpecker/*.yml for missing timeouts and unsafe commands
disinto_validate() {
local subcmd="${1:-lint-ci}"
shift || true
case "$subcmd" in
lint-ci)
_validate_lint_ci "$@"
;;
*)
cat <<EOF >&2
Usage: disinto validate <subcommand>
Validate factory artifacts:
lint-ci Lint .woodpecker/*.yml for missing timeouts and unsafe commands
Subcommands:
lint-ci Check CI pipeline files for:
- Steps missing a timeout declaration
- Network-fetch commands without per-command timeouts
EOF
exit 1
;;
esac
}
# Lint .woodpecker/*.yml files for missing timeouts and unsafe commands.
#
# Checks:
# 1. Step-level timeout: every step must declare a `timeout:` value or
# inherit from a workflow-level default.
# 2. Command-level timeout: network-fetch commands (curl, wget, pip, etc.)
# must include an explicit timeout flag (e.g. --max-time, --timeout).
_validate_lint_ci() {
local lint_dir="${1:-.}"
local woodpecker_dir="${lint_dir}/.woodpecker"
local errors=0
local warnings=0
if [ ! -d "$woodpecker_dir" ]; then
echo "No .woodpecker/ directory found at ${woodpecker_dir}"
exit 0
fi
local -a yml_files=()
while IFS= read -r f; do
yml_files+=("$f")
done < <(find "$woodpecker_dir" -maxdepth 1 -name '*.yml' -o -name '*.yaml' 2>/dev/null | sort)
if [ ${#yml_files[@]} -eq 0 ]; then
echo "No .woodpecker/*.yml files found"
exit 0
fi
echo "Linting CI pipeline files in ${woodpecker_dir}..."
echo ""
for yml in "${yml_files[@]}"; do
local rel_path
rel_path=$(realpath --relative-to="$(pwd)" "$yml" 2>/dev/null || echo "$yml")
# Use Python to parse YAML and check for timeouts
local result
result=$(python3 -c "
import yaml, sys, re
with open('$yml') as f:
try:
doc = yaml.safe_load(f)
except yaml.YAMLError as e:
print(f'FATAL:YAML parse error: {e}', file=sys.stderr)
sys.exit(1)
if not isinstance(doc, dict):
sys.exit(0)
# Check workflow-level timeout default
workflow_timeout = doc.get('timeout')
if isinstance(workflow_timeout, (int, float)):
workflow_timeout = str(workflow_timeout)
errors = []
warnings = []
steps = doc.get('steps', [])
if not isinstance(steps, list):
sys.exit(0)
for step in steps:
if not isinstance(step, dict):
continue
name = step.get('name', '<unnamed>')
commands = step.get('commands', [])
if not isinstance(commands, list):
continue
# Check step-level timeout (unless workflow default exists)
if 'timeout' not in step and workflow_timeout is None:
errors.append(f'error: {name} — step has no timeout; add \`timeout: 5m\` or inherit from workflow default')
# Check commands for network-fetch without timeout flags
cmd_text = ' '.join(str(c) for c in commands)
lines = commands # check each command individually
for cmd in lines:
cmd_str = str(cmd)
# Skip comments and empty lines
stripped = cmd_str.strip()
if not stripped or stripped.startswith('#'):
continue
# Skip package manager installs (e.g., apk add ... curl)
if re.search(r'\b(apk|apt|yum|dnf|brew)\s+(add|install)\b', cmd_str):
continue
# Skip shell/python invocations (commands that execute scripts)
if re.match(r'\s*(bash|sh|zsh|python3?|node)\s', cmd_str):
continue
# Network-fetch binaries to check
# curl — check for --max-time, -m, or --connect-timeout
if re.search(r'\bcurl\b', cmd_str):
if not re.search(r'(--max-time|-m\s+\d|--connect-timeout)', cmd_str):
warnings.append(f'warning: {name}/command — curl without --max-time; consider: curl --max-time 30 ...')
# wget — check for --timeout
if re.search(r'\bwget\b', cmd_str):
if not re.search(r'--timeout=', cmd_str):
warnings.append(f'warning: {name}/command — wget without --timeout; consider: wget --timeout=30 ...')
# pip/pip3 — check for --default-timeout or --timeout
if re.search(r'\b(pip3?|pipenv)\b', cmd_str) and re.search(r'\b(install|i)\b', cmd_str):
if not re.search(r'(--default-timeout|--timeout)', cmd_str):
warnings.append(f'warning: {name}/command — pip install without --default-timeout; consider: --default-timeout 30')
# npm — check for --timeout
if re.search(r'\bnpm\b', cmd_str) and re.search(r'\b(install|add)\b', cmd_str):
if not re.search(r'--timeout', cmd_str):
warnings.append(f'warning: {name}/command — npm install without --timeout; consider: --timeout 30000')
# yarn — check for --timeout
if re.search(r'\byarn\b', cmd_str) and re.search(r'\b(add|install)\b', cmd_str):
if not re.search(r'--timeout', cmd_str):
warnings.append(f'warning: {name}/command — yarn add without --timeout; consider: --timeout 30000')
# go get — no direct timeout flag, but we warn about it
if re.search(r'\bgo\s+get\b', cmd_str):
warnings.append(f'warning: {name}/command — go get has no timeout flag; wrap in a timeout(1) command')
# cargo install — check for --timeout (cargo doesn't have one natively)
if re.search(r'\bcargo\s+install\b', cmd_str):
warnings.append(f'warning: {name}/command — cargo install has no timeout flag; wrap in a timeout(1) command')
# gem install — no timeout flag
if re.search(r'\bgem\s+install\b', cmd_str):
warnings.append(f'warning: {name}/command — gem install has no timeout flag; wrap in a timeout(1) command')
# brew install — no timeout flag
if re.search(r'\bbrew\s+install\b', cmd_str):
warnings.append(f'warning: {name}/command — brew install has no timeout flag; wrap in a timeout(1) command')
if errors:
for e in errors:
print(f'E:{e}')
if warnings:
for w in warnings:
print(f'W:{w}')
" 2>&1) || {
echo "ERROR: failed to parse $rel_path" >&2
echo "$result" >&2
exit 1
}
# Parse Python output
while IFS= read -r line; do
case "$line" in
E:*)
echo "${line#E:}" >&2
errors=$((errors + 1))
;;
W:*)
echo "${line#W:}"
warnings=$((warnings + 1))
;;
esac
done <<< "$result"
done
echo ""
echo "lint-ci: ${errors} error(s), ${warnings} warning(s)"
if [ "$errors" -gt 0 ]; then
echo ""
echo "Fix: add \`timeout:\` to each step, or set a workflow-level default at the top of the pipeline file." >&2
exit 1
fi
if [ "$warnings" -gt 0 ]; then
echo "(warnings are non-blocking — add per-command timeouts for network calls)" >&2
fi
exit 0
}
# ── Main dispatch ────────────────────────────────────────────────────────────
case "${1:-}" in
@ -3236,11 +3025,9 @@ case "${1:-}" in
ci-logs) shift; disinto_ci_logs "$@" ;;
release) shift; disinto_release "$@" ;;
hire-an-agent) shift; disinto_hire_an_agent "$@" ;;
role) shift; disinto_role "$@" ;;
agent) shift; disinto_agent "$@" ;;
edge) shift; disinto_edge "$@" ;;
backup) shift; disinto_backup "$@" ;;
validate) shift; disinto_validate "$@" ;;
-h|--help) usage ;;
*) usage ;;
esac

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: 5be020b9de1a719cb331b930cf45caf7559473f7 -->
<!-- last-reviewed: 0d6181918452c1407a3f6bc62917724261acff26 -->
# Dev Agent
**Role**: Implement issues autonomously — write code, push branches, address

View file

@ -545,7 +545,7 @@ print(cfg.get('primary_branch', 'main'))
if [ $((supervisor_iteration % SUPERVISOR_INTERVAL)) -eq 0 ] && [ "$now" -ge "$supervisor_iteration" ]; then
if ! pgrep -f "supervisor-run.sh" >/dev/null; then
log "Running supervisor (iteration ${iteration}, ${SUPERVISOR_INTERVAL}s interval) for ${toml}"
gosu agent bash -c "cd ${DISINTO_DIR} && bash supervisor/supervisor-run.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/supervisor/supervisor.log" 2>&1 &
gosu agent bash -c "cd ${DISINTO_DIR} && bash supervisor/supervisor-run.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/supervisor.log" 2>&1 &
else
log "Skipping supervisor — already running"
fi

View file

@ -124,7 +124,7 @@ if [ -f /opt/disinto/lib/git-creds.sh ]; then
fi
# Ensure log directory exists
mkdir -p /opt/disinto-logs/supervisor
mkdir -p /opt/disinto-logs
# ── Reverse tunnel (optional) ──────────────────────────────────────────
# When EDGE_TUNNEL_HOST is set, open a single reverse-SSH forward so the
@ -169,7 +169,7 @@ bash /opt/disinto/docker/edge/dispatcher.sh &
# Start supervisor loop in background
PROJECT_TOML="${PROJECT_TOML:-projects/disinto.toml}"
(while true; do
bash /opt/disinto/supervisor/supervisor-run.sh "/opt/disinto/${PROJECT_TOML}" 2>&1 | tee -a /opt/disinto-logs/supervisor/supervisor.log || true
bash /opt/disinto/supervisor/supervisor-run.sh "/opt/disinto/${PROJECT_TOML}" 2>&1 | tee -a /opt/disinto-logs/supervisor.log || true
sleep 1200 # 20 minutes
done) &

View file

@ -22,8 +22,6 @@
# architect-bot: READ-ONLY on project repo (GET issues/PRs/labels for context).
# Cannot POST/PUT/PATCH/DELETE any project-repo resource.
# Write access ONLY on ops repo (branches, PRs, comments).
# DO NOT create issues on the project repo. Sub-issues are filed by
# filer-bot after sprint PR merge via the ops-filer pipeline.
# filer-bot: issues:write on project repo. Files sub-issues from merged sprint
# PRs via ops-filer pipeline. Adds in-progress label to vision issues.
#
@ -175,10 +173,6 @@ The ## Sub-issues block is parsed by the filer-bot pipeline after sprint PR merg
Each sub-issue between filer:begin/end markers becomes a Forgejo issue on the
project repo. The filer appends a decomposed-from marker to each body automatically.
CRITICAL: You DO NOT have access to the project repo API. Sub-issues are filed
by filer-bot from the sprint file after merge. Do NOT attempt to create issues
via API calls the token will 403 and the run will fail.
4. Bash creates PR:
- Create branch: architect/sprint-{pitch-number}
- Write sprint spec to sprints/{sprint-slug}.md

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: 5be020b9de1a719cb331b930cf45caf7559473f7 -->
<!-- last-reviewed: 0d6181918452c1407a3f6bc62917724261acff26 -->
# Gardener Agent
**Role**: Backlog grooming — detect duplicate issues, missing acceptance

View file

@ -1,22 +1,16 @@
[
{
"action": "edit_body",
"issue": 1150,
"body": "## Problem\n\n`supervisor-run.sh` writes its structured log to `data/logs/supervisor/supervisor.log` (directory form). The polling loop in `docker/agents/entrypoint.sh` redirects the supervisor invocation's stderr to `data/logs/supervisor.log` (singular file form, sibling of the directory). Two different paths for one component's log stream.\n\nWhy this matters: when #1120's unbound-variable abort happened, the real error landed in the singular `data/logs/supervisor.log` (the stderr-redirect path), but operators checking supervisor health looked at `data/logs/supervisor/supervisor.log` (the directory form the agent writes) and saw only `--- Supervisor run start ---` with nothing after. That dual-sink is why the failure was silent for ~48h.\n\nThis is a class-of-failure pattern: any future silent-abort in `supervisor-run.sh` will repeat the same invisibility, because the two sinks are structurally divergent. #1121 fixes the specific unbound-var root cause; this one removes the invisibility layer so the next silent-abort class surfaces immediately.\n\n## Fix\n\nUnify to a single path. Recommended: keep the directory form `data/logs/supervisor/supervisor.log` as the canonical sink, and change the entrypoint stderr redirect to append into the same file instead of a sibling.\n\nChange locations:\n\n1. **`docker/agents/entrypoint.sh`** — the line that invokes `supervisor-run.sh` and redirects stderr. Grep `supervisor-run.sh` or `supervisor.log` in the entrypoint to find it. Change from:\n ```bash\n bash supervisor/supervisor-run.sh 2> data/logs/supervisor.log\n ```\n to:\n ```bash\n bash supervisor/supervisor-run.sh 2>> data/logs/supervisor/supervisor.log\n ```\n Use `2>>` (append) not `2>` (overwrite) so a stderr abort on one iteration does not wipe the structured log written by previous iterations.\n\n2. **Audit `supervisor-run.sh` itself** for any hardcoded reference to the singular path. If found, migrate to the directory path.\n\n3. 
**Document the canonical sink** in `supervisor/AGENTS.md` (or the nearest AGENTS.md covering supervisor entrypoints) so the path does not re-fork in the future.\n\n## Acceptance criteria\n\n- [ ] Only one on-disk path for supervisor logs: `find data/logs -name 'supervisor*'` returns the directory form only, no sibling singular file.\n- [ ] An intentionally-failing supervisor run on a throwaway branch (e.g. add `: ${DOES_NOT_EXIST:?boom}` at the top of `supervisor-run.sh`) produces visible error output in the canonical sink on the next polling iteration.\n- [ ] No regression: normal supervisor runs continue to write the `--- Supervisor run start ---` / `--- Supervisor run done ---` markers.\n- [ ] The fix applies inside `disinto-agents` without requiring image rebuild (entrypoint mount path) — or, if image rebuild is required, that requirement is noted in the PR body.\n\n## Affected files\n\n- `docker/agents/entrypoint.sh` — change stderr redirect for supervisor invocation\n- `supervisor/supervisor-run.sh` — audit for hardcoded singular log path\n- `supervisor/AGENTS.md` — document canonical log sink\n\n## Related\n\n- #1120 — the 48h silent-abort incident that exposed the dual-sink\n- #1121 — unbound-var root-cause fix; this issue is the complementary visibility fix\n- Vision #1147 (heartbeat + self-restart for long-running loops) — forward direction; unifying the log path is the minimal precondition for any heartbeat writer to reliably emit failure breadcrumbs"
},
{
"action": "edit_body",
"issue": 1124,
"body": "## Symptom\n\nThe `caddy-validate` step in the `edge-subpath` workflow fails intermittently with:\n\n```\nGet \"http://%2Fvar%2Frun%2Fdocker.sock/v1.41/containers/wp_01KPQZ2WV7SVX68TDRC7DP2Z9M/json\": context deadline exceeded\n```\n\nExit code on the step: `126`. Downstream steps (`caddyfile-routing-test`, `test-caddyfile-routing`, etc.) get skipped, and the workflow reports `failure`.\n\nThis showed up on PR #1108 (gardener housekeeping, commit `0946ca9828`, pipeline 1597, workflow id 3470, step pid 12). Also pending-forever on the sibling workflows for PR #1112 (pipeline 1599) and PR #1113 (pipeline 1601).\n\nThe `edge-subpath` workflow is not in the required-status-contexts list (branch protection requires `ci/woodpecker/pr/ci` and `ci/woodpecker/push/ci` only), so this does not block merge by itself. But it leaves combined commit status at `failure`/`pending` and reviewer-agent gates on combined status — every legitimate review flow stalls here.\n\n## Reproduction\n\nHappens under load when multiple pipelines queue up. The step mounts the host `/var/run/docker.sock` and does Docker-in-Docker introspection; the `GET container` call times out during socket saturation.\n\n## Likely cause\n\n1. **Socket passthrough is saturated.** Nested Docker API calls exceed the default deadline during pipeline pile-up.\n2. **Woodpecker agent step timeout is too tight** for caddy-validate during busy periods.\n3. 
**The step code uses a short `context.WithTimeout`** that does not account for a busy Docker daemon.\n\n## Fix candidates\n\n- If the step's container-introspect is incidental, switch to polling with retry + exponential backoff and a larger overall budget (60–120s).\n- If the step needs to spawn a sibling container, run caddy validate directly inside the workflow container (no docker.sock mount needed — `caddy validate` is a binary call).\n- Short-term: mark `edge-subpath` as optional or move it to a separate optional pipeline so it stops polluting combined status on otherwise-green PRs.\n\n## Acceptance criteria\n\n- [ ] A PR that passes the required `ci` workflow also produces a green (or explicitly-optional) `edge-subpath` result, with no `context deadline exceeded` in the step logs over ten consecutive runs.\n- [ ] Reviewer-agent no longer gets blocked by the `edge-subpath` workflow on merge-eligible PRs.\n- [ ] If the fix is \"mark as optional,\" the branch-protection required-contexts list is reviewed so it is clear which checks actually gate merges.\n\n## Affected files\n\n- `.woodpecker/edge-subpath.yml` — the CI pipeline defining the caddy-validate step\n- `tests/smoke-edge-subpath.sh` — the smoke test script invoked by the pipeline (if it contains the docker.sock introspection)\n\n## Context\n\nObserved 2026-04-21 during triage of why PRs were backing up in queue. WP agent restart drained the queue for most workflows; this one step remained stuck or timing out. The merged commit for #1108 shipped with this check in `failure`."
},
{
"action": "add_label",
"issue": 1124,
"label": "backlog"
},
{
"action": "comment",
"issue": 1121,
"body": "CI on PR #1143 is showing `failure` with all `null` status values — this is the edge-subpath docker.sock timeout pattern documented in #1124.\n\nThe fix in #1143 is correct (verified: adds `resolve_forge_remote` before `formula_worktree_setup`). The CI failure is environmental, not caused by this change.\n\n**Unblock path:** Once #1124 is resolved (edge-subpath caddy-validate no longer times out on docker.sock), PR #1143 should be retriable. Alternatively, if the required CI contexts (`ci/woodpecker/pr/ci`, `ci/woodpecker/push/ci`) pass, the PR can merge independently of the edge-subpath failure."
"issue": 1103,
"body": "### CI deadlock — root cause identified\n\nPR #1104 has been failing CI (exit 128 on git clone) due to a circular dependency:\n\n1. **#1080** set `FORGEJO__server__ROOT_URL=http://forgejo:3000/forge/`\n2. Forgejo now includes `/forge/` in the `clone_url` field of webhook payloads to Woodpecker\n3. Woodpecker uses that URL: `http://token@forgejo:3000/forge/disinto-admin/disinto.git`\n4. Caddy forwards `/forge/…` → `forgejo:3000/forge/…` but **without** `uri strip_prefix /forge`\n5. Forgejo's router returns 404 → git exits 128\n6. **This is the exact bug PR #1104 fixes** — the fix can't merge because CI can't clone to run the fix\n\nCaught from actual CI log:\n```\nfatal: repository '***/forge/disinto-admin/disinto.git/' not found\n```\n\nThe code change in PR #1104 (`lib/generators.sh`, `nomad/jobs/edge.hcl`) is correct and minimal. CI failure is environmental, not a code issue.\n\n**Resolution options:**\n- A: Operator manually merges PR #1104 (code is verified correct, 2-line change, same pattern as already-merged #1079)\n- B: Operator applies `uri strip_prefix /forge` live to the edge Caddy config to break the deadlock, then CI will pass on the next push\n\nFiled vault item for option A."
},
{
"action": "create_issue",
"title": "ops: manually merge PR #1104 — CI deadlock (Caddy /forge/ strip_prefix fix can't pass CI it fixes)",
"body": "## Situation\n\nPR #1104 fixes `lib/generators.sh` to add `uri strip_prefix /forge` before `reverse_proxy forgejo:3000` in the `/forge/*` Caddy handle block. This is required because Forgejo returns 404 for `/forge/*` paths without the prefix being stripped.\n\n## Circular dependency\n\nCI cannot clone the repo to run this PR because:\n\n1. `#1080` set `FORGEJO__server__ROOT_URL=http://forgejo:3000/forge/`\n2. Forgejo includes `/forge/` in the `clone_url` in webhook payloads → Woodpecker uses `http://token@forgejo:3000/forge/disinto-admin/disinto.git`\n3. Caddy forwards `/forge/...` to Forgejo **without** strip_prefix → Forgejo router returns 404\n4. `git clone` exits 128\n5. CI has failed on 4 consecutive runs (pipelines 15921595)\n\nVerified from actual CI log:\n```\nfatal: repository '***/forge/disinto-admin/disinto.git/' not found\n```\n\n## What needs to happen\n\nOne of:\n- **Option A (recommended):** Admin merges PR #1104 manually, bypassing CI. The code change is verified correct — identical pattern to already-merged `/staging/` strip (#1079). Once merged, the Caddy config regenerates and future PR clones work.\n- **Option B:** Operator applies `uri strip_prefix /forge` live to the edge Caddy config to break the deadlock, then pushes an empty commit to trigger CI retry.\n\n## Acceptance criteria\n\n- [ ] PR #1104 merged (manually or after CI passes post live-patch)\n- [ ] `curl http://localhost/forge/` returns 200\n- [ ] CI clone succeeds for subsequent PRs\n\n## Affected files\n\n- `lib/generators.sh` — the fix\n- PR #1104 on this repo",
"labels": [
"priority",
"backlog"
]
}
]

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: 5be020b9de1a719cb331b930cf45caf7559473f7 -->
<!-- last-reviewed: 0d6181918452c1407a3f6bc62917724261acff26 -->
# Shared Helpers (`lib/`)
All agents source `lib/env.sh` as their first action. Additional helpers are
@ -7,13 +7,13 @@ sourced as needed.
| File | What it provides | Sourced by |
|---|---|---|
| `lib/env.sh` | Loads `.env`, sets `FACTORY_ROOT`, exports project config (`FORGE_REPO`, `PROJECT_NAME`, etc.), defines `log()`, `forge_api()`, `forge_api_all()` (paginates all pages; accepts optional second TOKEN parameter, defaults to `$FORGE_TOKEN`; handles invalid/empty JSON responses gracefully — returns empty on parse error instead of crashing), `woodpecker_api()`, `wpdb()`, `memory_guard()` (skips agent if RAM < threshold), `load_secret()` (secret-source abstraction see below). Auto-loads project TOML if `PROJECT_TOML` is set. Exports per-agent tokens (`FORGE_PLANNER_TOKEN`, `FORGE_GARDENER_TOKEN`, `FORGE_VAULT_TOKEN`, `FORGE_SUPERVISOR_TOKEN`, `FORGE_PREDICTOR_TOKEN`) each falls back to `$FORGE_TOKEN` if not set. **Vault-only token guard (AD-006)**: `unset GITHUB_TOKEN CLAWHUB_TOKEN` so agents never hold external-action tokens only the runner container receives them. **Container note**: when `DISINTO_CONTAINER=1`, `.env` is NOT re-sourced compose already injects env vars (including `FORGE_URL=http://forgejo:3000`) and re-sourcing would clobber them. **Save/restore scope (#364)**: only `FORGE_URL` is preserved across `.env` re-sourcing (compose injects `http://forgejo:3000`, `.env` has `http://localhost:3000`). `FORGE_TOKEN` is NOT preserved so refreshed tokens in `.env` take effect immediately. **Per-agent token override (#762)**: agent run scripts export `FORGE_TOKEN_OVERRIDE=<agent-specific-token>` BEFORE sourcing `env.sh`; `env.sh` applies this override at lines 98-100, ensuring the correct identity survives any re-sourcing of `env.sh` by nested shells or `claude -p` invocations. **Required env var**: `FORGE_PASS` bot password for git HTTP push (Forgejo 11.x rejects API tokens for `git push`, #361). **Hard preconditions (#674)**: `USER` and `HOME` must be exported by the entrypoint before sourcing. When `PROJECT_TOML` is set, `PROJECT_REPO_ROOT`, `PRIMARY_BRANCH`, and `OPS_REPO_ROOT` must also be set (by entrypoint or TOML). 
**`load_secret NAME [DEFAULT]` (#793)**: backend-agnostic secret resolution. Precedence: (1) `/secrets/<NAME>.env` Nomad-rendered template, (2) current environment already set by `.env.enc` / compose, (3) `secrets/<NAME>.enc` age-encrypted per-key file (decrypted on demand, cached in process env), (4) DEFAULT or empty. Consumers call `$(load_secret GITHUB_TOKEN)` instead of `${GITHUB_TOKEN}` identical behavior whether secrets come from Docker compose injection or Nomad Vault templates. | Every agent |
| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `ci_required_for_pr()` — returns 0 if PR has code files (CI required), 1 if non-code only (CI not required). `is_infra_step()` — returns 0 if a single CI step failure matches infra heuristics (clone/git exit 128, any exit 137, log timeout patterns). `classify_pipeline_failure()` — returns "infra \<reason>" if any failed Woodpecker step matches infra heuristics via `is_infra_step()`, else "code". `ensure_priority_label()` — looks up (or creates) the `priority` label and returns its ID; caches in `_PRIORITY_LABEL_ID`. `ci_commit_status <sha>` — queries Woodpecker directly for CI state, falls back to forge commit status API. `ci_pipeline_number <sha>` — returns the Woodpecker pipeline number for a commit, falls back to parsing forge status `target_url`. `ci_promote <repo_id> <pipeline_num> <environment>` — promotes a pipeline to a named Woodpecker environment (vault-gated deployment: vault approves, vault-fire calls this — vault redesign in progress, see #73-#77). `ci_get_logs <pipeline_number> [--step <name>]` — reads CI logs from Woodpecker SQLite database via `lib/ci-log-reader.py`; outputs last 200 lines to stdout. Requires mounted woodpecker-data volume at /woodpecker-data. `ci_get_step_logs <pipeline_num> <step_id>` — fetches per-step logs via Woodpecker REST API (`/repos/{id}/logs/{pipeline}/{step_id}`); returns raw log data for a single step. Used by `pr_poll_ci()` to build per-workflow/per-step CI diagnostics (#1051). `ci_required_contexts([branch])` — returns newline-separated list of required status check context names from branch protection; cached per poll cycle in `_CI_REQUIRED_CONTEXTS`. `_ci_reduce_required_contexts(sha, required_contexts)` — reduces commit statuses to required contexts only; stdout: `success` \| `failure` \| `pending` (#1136). | dev-poll, review-poll, review-pr |
| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `ci_required_for_pr()` — returns 0 if PR has code files (CI required), 1 if non-code only (CI not required). `is_infra_step()` — returns 0 if a single CI step failure matches infra heuristics (clone/git exit 128, any exit 137, log timeout patterns). `classify_pipeline_failure()` — returns "infra \<reason>" if any failed Woodpecker step matches infra heuristics via `is_infra_step()`, else "code". `ensure_priority_label()` — looks up (or creates) the `priority` label and returns its ID; caches in `_PRIORITY_LABEL_ID`. `ci_commit_status <sha>` — queries Woodpecker directly for CI state, falls back to forge commit status API. `ci_pipeline_number <sha>` — returns the Woodpecker pipeline number for a commit, falls back to parsing forge status `target_url`. `ci_promote <repo_id> <pipeline_num> <environment>` — promotes a pipeline to a named Woodpecker environment (vault-gated deployment: vault approves, vault-fire calls this — vault redesign in progress, see #73-#77). `ci_get_logs <pipeline_number> [--step <name>]` — reads CI logs from Woodpecker SQLite database via `lib/ci-log-reader.py`; outputs last 200 lines to stdout. Requires mounted woodpecker-data volume at /woodpecker-data. `ci_get_step_logs <pipeline_num> <step_id>` — fetches per-step logs via Woodpecker REST API (`/repos/{id}/logs/{pipeline}/{step_id}`); returns raw log data for a single step. Used by `pr_poll_ci()` to build per-workflow/per-step CI diagnostics (#1051). | dev-poll, review-poll, review-pr |
| `lib/ci-debug.sh` | CLI tool for Woodpecker CI: `list`, `status`, `logs`, `failures` subcommands. Not sourced — run directly. | Humans / dev-agent (tool access) |
| `lib/ci-log-reader.py` | Python tool: reads CI logs from Woodpecker SQLite database. `<pipeline_number> [--step <name>]` — returns last 200 lines from failed steps (or specified step). Used by `ci_get_logs()` in ci-helpers.sh. Requires `WOODPECKER_DATA_DIR` (default: /woodpecker-data). | ci-helpers.sh |
| `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `FORGE_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, mirror config, etc.). Also exports `FORGE_REPO_OWNER` (the owner component of `FORGE_REPO`, e.g. `disinto-admin` from `disinto-admin/disinto`). Reads `repo_root` and `ops_repo_root` from the TOML for host-CLI callers. **Container path handling (#674)**: no longer derives `PROJECT_REPO_ROOT` or `OPS_REPO_ROOT` inside the script — container entrypoints export the correct paths before agent scripts source `env.sh`, and the `DISINTO_CONTAINER` guard (line 90) skips TOML overrides when those vars are already set. | env.sh (when `PROJECT_TOML` is set) |
| `lib/parse-deps.sh` | Extracts dependency issue numbers from an issue body (stdin → stdout, one number per line). Matches `## Dependencies` / `## Depends on` / `## Blocked by` sections and inline `depends on #N` / `blocked by #N` patterns. Inline scan skips fenced code blocks to prevent false positives from code examples in issue bodies. Not sourced — executed via `bash lib/parse-deps.sh`. | dev-poll |
| `lib/formula-session.sh` | `acquire_run_lock()`, `load_formula()`, `load_formula_or_profile()`, `build_context_block()`, `ensure_ops_repo()`, `ops_commit_and_push()`, `build_prompt_footer()`, `build_sdk_prompt_footer()`, `formula_worktree_setup()`, `formula_prepare_profile_context()`, `formula_lessons_block()`, `profile_write_journal()`, `profile_load_lessons()`, `ensure_profile_repo()`, `_profile_has_repo()`, `_count_undigested_journals()`, `_profile_digest_journals()`, `_profile_restore_lessons()`, `_profile_commit_and_push()`, `resolve_agent_identity()`, `build_graph_section()`, `build_scratch_instruction()`, `read_scratch_context()`, `cleanup_stale_crashed_worktrees()` — shared helpers for formula-driven polling-loop agents (lock, .profile repo management, prompt assembly, worktree setup). Memory guard is provided by `memory_guard()` in `lib/env.sh` (not duplicated here). `resolve_agent_identity()` — sets `FORGE_TOKEN`, `AGENT_IDENTITY`, `FORGE_REMOTE` from per-agent token env vars and FORGE_URL remote detection. `build_graph_section()` generates the structural-analysis section (runs `lib/build-graph.py`, formats JSON output) — previously duplicated in planner-run.sh and predictor-run.sh, now shared here. `cleanup_stale_crashed_worktrees()` — thin wrapper around `worktree_cleanup_stale()` from `lib/worktree.sh` (kept for backwards compatibility). **Journal digestion guards (#702)**: `_profile_digest_journals()` respects `PROFILE_DIGEST_TIMEOUT` (default 300s) and `PROFILE_DIGEST_MAX_BATCH` (default 5 journals per run); `_profile_restore_lessons()` restores the previous lessons-learned.md on digest failure. | planner-run.sh, predictor-run.sh, gardener-run.sh, supervisor-run.sh, dev-agent.sh |
| `lib/guard.sh` | `check_active(role_name)` — reads `$FACTORY_ROOT/state/.{role_name}-active`; exits 0 (skip) if the file is absent. Factory is off by default — state files must be created to enable each role. **Logs a message to stderr** when skipping (`[check_active] SKIP: state file not found`), so role dropout is visible in loop logs. Sourced by dev-poll.sh, review-poll.sh, predictor-run.sh, supervisor-run.sh. | polling-loop entry points |
| `lib/guard.sh` | `check_active(agent_name)` — reads `$FACTORY_ROOT/state/.{agent_name}-active`; exits 0 (skip) if the file is absent. Factory is off by default — state files must be created to enable each agent. **Logs a message to stderr** when skipping (`[check_active] SKIP: state file not found`), so agent dropout is visible in loop logs. Sourced by dev-poll.sh, review-poll.sh, predictor-run.sh, supervisor-run.sh. | polling-loop entry points |
| `lib/mirrors.sh` | `mirror_push()` — pushes `$PRIMARY_BRANCH` + tags to all configured mirror remotes (fire-and-forget background pushes). Reads `MIRROR_NAMES` and `MIRROR_*` vars exported by `load-project.sh` from the `[mirrors]` TOML section. Failures are logged but never block the pipeline. `mirror_pull_register(clone_url, owner, repo_name, [interval])` — registers a Forgejo pull mirror via `POST /repos/migrate` with `mirror: true`. Creates the target repo and queues the first sync automatically. Works against empty Forgejo instances — no pre-existing content required. Used for Nomad migration cutover: point at Codeberg source, wait for sync, then proceed with `disinto init`. See [docs/mirror-bootstrap.md](../docs/mirror-bootstrap.md) for the full cutover path. Sourced by dev-poll.sh — called after every successful merge. | dev-poll.sh |
| `lib/build-graph.py` | Python tool: parses VISION.md, prerequisites.md (from ops repo), AGENTS.md, formulas/*.toml, evidence/ (from ops repo), and forge issues/labels into a NetworkX DiGraph. Runs structural analyses (orphaned objectives, stale prerequisites, thin evidence, circular deps) and outputs a JSON report. Used by `review-pr.sh` (per-PR changed-file analysis) and `predictor-run.sh` (full-project analysis) to provide structural context to Claude. | review-pr.sh, predictor-run.sh |
| `lib/secret-scan.sh` | `scan_for_secrets()` — detects potential secrets (API keys, bearer tokens, private keys, URLs with embedded credentials) in text; returns 1 if secrets found. `redact_secrets()` — replaces detected secret patterns with `[REDACTED]`. | issue-lifecycle.sh |
@ -30,7 +30,7 @@ sourced as needed.
| `lib/git-creds.sh` | Shared git credential helper configuration. `configure_git_creds([HOME_DIR] [RUN_AS_CMD])` — writes a static credential helper script and configures git globally to use password-based HTTP auth (Forgejo 11.x rejects API tokens for `git push`, #361). **Retry on cold boot (#741)**: resolves bot username from `FORGE_TOKEN` with 5 retries (exponential backoff 1-5s); fails loudly and returns 1 if Forgejo is unreachable — never falls back to a wrong hardcoded default (exports `BOT_USER` on success). `repair_baked_cred_urls([--as RUN_AS_CMD] DIR ...)` — rewrites any git remote URLs that have credentials baked in to use clean URLs instead; uses `safe.directory` bypass for root-owned repos (#671). Requires `FORGE_PASS`, `FORGE_URL`, `FORGE_TOKEN`. | entrypoints (agents, edge) |
| `lib/ops-setup.sh` | `setup_ops_repo()` — creates ops repo on Forgejo if it doesn't exist, configures bot collaborators, clones/initializes ops repo locally, seeds directory structure (vault, knowledge, evidence, sprints). Evidence subdirectories seeded: engagement/, red-team/, holdout/, evolution/, user-test/. Also seeds sprints/ for architect output. Exports `_ACTUAL_OPS_SLUG`. `migrate_ops_repo(ops_root, [primary_branch])` — idempotent migration helper that seeds missing directories and .gitkeep files on existing ops repos (pre-#407 deployments). | bin/disinto (init) |
| `lib/ci-setup.sh` | `_install_cron_impl()` — installs crontab entries for bare-metal deployments (compose mode uses polling loop instead). `_create_forgejo_oauth_app()` — generic helper to create an OAuth2 app on Forgejo (shared by Woodpecker and chat). `_create_woodpecker_oauth_impl()` — creates Woodpecker OAuth2 app (thin wrapper). `_create_chat_oauth_impl()` — creates disinto-chat OAuth2 app, writes `CHAT_OAUTH_CLIENT_ID`/`CHAT_OAUTH_CLIENT_SECRET` to `.env` (#708). `_generate_woodpecker_token_impl()` — auto-generates WOODPECKER_TOKEN via OAuth2 flow. `_activate_woodpecker_repo_impl()` — activates repo in Woodpecker. All gated by `_load_ci_context()` which validates required env vars. | bin/disinto (init) |
| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (**duplicate service detection**: tracks service names during generation, aborts with `ERROR: Duplicate service name '$name' detected` on conflict; detection state is reset between calls so idempotent reinvocation is safe, #850) (uses `codeberg.org/forgejo/forgejo:11.0` tag; `CLAUDE_BIN_DIR` volume mount removed from agents/llama services — only `reproduce` and `edge` still use the host-mounted CLI (#992); adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility; Forgejo includes a healthcheck so dependent services use `condition: service_healthy` — fixes cold-start races, #665; adds `chat` service block with isolated `chat-config` named volume and `CHAT_HISTORY_DIR` bind-mount for per-user NDJSON history persistence (#710); injects `FORWARD_AUTH_SECRET` for Caddy↔chat defense-in-depth auth (#709); subdomain fallback: `EDGE_ROUTING_MODE` (default `subpath`) and per-service `EDGE_TUNNEL_FQDN_*` vars injected into edge service (#1028); chat service rate limiting removed (#1084); chat workspace dir bind-mount: `${CHAT_WORKSPACE_DIR:-./workspace}:/var/workspace` + `CHAT_WORKSPACE_DIR` env var injected so Claude can access project working tree (#1027); all `depends_on` now use `condition: service_healthy/started` instead of bare service names; all services now include `restart: unless-stopped` including the edge service — #768; agents service now uses `image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}` instead of `build:` (#429); `WOODPECKER_PLUGINS_PRIVILEGED` env var added to woodpecker service (#779); agents-llama conditional block gated on `ENABLE_LLAMA_AGENT=1` (#769); `agents-llama-all` compose service (profile `agents-llama-all`, all 7 roles: review,dev,gardener,architect,planner,predictor,supervisor) added by #801; agents service gains volume mounts for `./projects`, `./.env`, `./state`), `generate_caddyfile()` 
— Caddyfile (routes: `/forge/*` → forgejo:3000 with `uri strip_prefix /forge` (#1103), `/woodpecker/*` → woodpecker:8000, `/staging/*` → staging:80; `/chat/login` and `/chat/oauth/callback` bypass `forward_auth` so unauthenticated users can reach the OAuth flow; `/chat/*` gated by `forward_auth` on `chat:8080/chat/auth/verify` which stamps `X-Forwarded-User` (#709); root `/` redirects to `/forge/`), `generate_staging_index()` — staging index, `generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) |
| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (**duplicate service detection**: tracks service names during generation, aborts with `ERROR: Duplicate service name '$name' detected` on conflict; detection state is reset between calls so idempotent reinvocation is safe, #850) (uses `codeberg.org/forgejo/forgejo:11.0` tag; `CLAUDE_BIN_DIR` volume mount removed from agents/llama services — only `reproduce` and `edge` still use the host-mounted CLI (#992); adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility; Forgejo includes a healthcheck so dependent services use `condition: service_healthy` — fixes cold-start races, #665; adds `chat` service block with isolated `chat-config` named volume and `CHAT_HISTORY_DIR` bind-mount for per-user NDJSON history persistence (#710); injects `FORWARD_AUTH_SECRET` for Caddy↔chat defense-in-depth auth (#709); subdomain fallback: `EDGE_ROUTING_MODE` (default `subpath`) and per-service `EDGE_TUNNEL_FQDN_*` vars injected into edge service (#1028); chat service rate limiting removed (#1084); chat workspace dir bind-mount: `${CHAT_WORKSPACE_DIR:-./workspace}:/var/workspace` + `CHAT_WORKSPACE_DIR` env var injected so Claude can access project working tree (#1027); all `depends_on` now use `condition: service_healthy/started` instead of bare service names; all services now include `restart: unless-stopped` including the edge service — #768; agents service now uses `image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}` instead of `build:` (#429); `WOODPECKER_PLUGINS_PRIVILEGED` env var added to woodpecker service (#779); agents-llama conditional block gated on `ENABLE_LLAMA_AGENT=1` (#769); `agents-llama-all` compose service (profile `agents-llama-all`, all 7 roles: review,dev,gardener,architect,planner,predictor,supervisor) added by #801; agents service gains volume mounts for `./projects`, `./.env`, `./state`), `generate_caddyfile()` 
— Caddyfile (routes: `/forge/*` → forgejo:3000, `/woodpecker/*` → woodpecker:8000, `/staging/*` → staging:80; `/chat/login` and `/chat/oauth/callback` bypass `forward_auth` so unauthenticated users can reach the OAuth flow; `/chat/*` gated by `forward_auth` on `chat:8080/chat/auth/verify` which stamps `X-Forwarded-User` (#709); root `/` redirects to `/forge/`), `generate_staging_index()` — staging index, `generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) |
| `lib/backup.sh` | Factory backup creation. `backup_create <outfile.tar.gz>` — exports factory state: fetches all issues (open+closed) from the project and ops repos via Forgejo API, bundles the ops repo as a git bundle, and writes a tarball. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_REPO`, `FORGE_OPS_REPO`, `OPS_REPO_ROOT`. Sourced by `bin/disinto backup create` (#1057). | bin/disinto (backup create) |
| `lib/disinto/backup.sh` | Factory backup restore. `backup_import <infile.tar.gz>` — restores from a backup tarball: creates missing repos via Forgejo API, imports issues (idempotent — skips by number if present), unpacks ops repo git bundle. Idempotent: running twice produces same end state with no errors. Requires `FORGE_URL`, `FORGE_TOKEN`. Sourced by `bin/disinto backup import` (#1058). | bin/disinto (backup import) |
| `lib/sprint-filer.sh` | Post-merge sub-issue filer for sprint PRs. Invoked by the `.woodpecker/ops-filer.yml` pipeline after a sprint PR merges to ops repo `main`. Parses `<!-- filer:begin --> ... <!-- filer:end -->` blocks from sprint PR bodies to extract sub-issue definitions, creates them on the project repo using `FORGE_FILER_TOKEN` (narrow-scope `filer-bot` identity with `issues:write` only), adds `in-progress` label to the parent vision issue, and handles vision lifecycle closure when all sub-issues are closed. Uses `filer_api_all()` for paginated fetches. Idempotent: uses `<!-- decomposed-from: #<vision>, sprint: <slug>, id: <id> -->` markers to skip already-filed issues. Requires `FORGE_FILER_TOKEN`, `FORGE_API`, `FORGE_API_BASE`, `FORGE_OPS_REPO`. | `.woodpecker/ops-filer.yml` (CI pipeline on ops repo) |

View file

@ -1,99 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# backfill-labels.sh — Backfill labels on issues that were filed out of band
#
# Usage:
#   backfill-labels.sh <issue-num> [<issue-num> ...] <label> [<label> ...]
#   backfill-labels.sh 1105 backlog
#   backfill-labels.sh 1105 1106 1107 backlog
#
# Environment:
#   FORGE_TOKEN — API token with issues:write scope (used for label operations)
#   FORGE_API — project repo API base URL
#
# This script is a one-off tool for recovering from out-of-band issue filing
# (e.g., architect-bot filing sub-issues directly instead of through filer-bot).
# See issue #1140 for context.
# =============================================================================
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Allow standalone invocation: source lib/env.sh only when the caller has not
# already exported a factory environment.
if [ -z "${FACTORY_ROOT:-}" ]; then
  FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"
  # shellcheck source=lib/env.sh
  source "$FACTORY_ROOT/lib/env.sh"
fi

if [ $# -lt 2 ]; then
  echo "Usage: $0 <issue-num> [<issue-num> ...] <label> [<label> ...]" >&2
  echo "  Last positional arg(s) are labels. All preceding args are issue numbers." >&2
  exit 1
fi

# Split args: numeric args are issue numbers, non-numeric args are labels.
args=("$@")
issue_nums=()
labels=()
for arg in "${args[@]}"; do
  if [[ "$arg" =~ ^[0-9]+$ ]]; then
    issue_nums+=("$arg")
  else
    labels+=("$arg")
  fi
done

# Fallback for all-numeric invocations (e.g. a label whose name is a number):
# the final arg is the label, everything before it is an issue number.
# Bug fix: the previous version *appended* args[0..n-2] to the already-fully-
# populated issue_nums, duplicating every issue number and leaving the label
# value in the issue list. Rebuild both arrays from scratch instead.
if [ ${#labels[@]} -eq 0 ]; then
  labels=("${args[-1]}")
  issue_nums=("${args[@]:0:$(($# - 1))}")
fi

if [ ${#issue_nums[@]} -eq 0 ]; then
  echo "ERROR: no issue numbers specified" >&2
  exit 1
fi
if [ ${#labels[@]} -eq 0 ]; then
  echo "ERROR: no labels specified" >&2
  exit 1
fi

# Resolve label names to numeric IDs via the repo's label list.
label_ids_json="[]"
for label_name in "${labels[@]}"; do
  # head -n1 guards against duplicate label names yielding multiple IDs,
  # which would break the --argjson append below.
  label_id=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
    "${FORGE_API}/labels" 2>/dev/null | jq -r --arg name "$label_name" \
    '.[] | select(.name == $name) | .id' 2>/dev/null | head -n 1) || true
  if [ -n "$label_id" ]; then
    label_ids_json=$(printf '%s' "$label_ids_json" | jq --argjson id "$label_id" '. + [$id]')
  else
    echo "WARNING: label '${label_name}' not found on project repo" >&2
  fi
done

if [ "$(printf '%s' "$label_ids_json" | jq 'length')" -eq 0 ]; then
  echo "ERROR: no label IDs resolved — cannot proceed" >&2
  exit 1
fi

# Apply the resolved label IDs to each issue; continue past per-issue failures
# so one bad issue number does not abort the whole backfill.
for issue_num in "${issue_nums[@]}"; do
  echo "Adding labels ${labels[*]} to issue #${issue_num}..."
  if ! curl -sf -X POST \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
    "${FORGE_API}/issues/${issue_num}/labels" \
    -d "{\"labels\": ${label_ids_json}}" 2>/dev/null; then
    echo "ERROR: failed to add labels to issue #${issue_num}" >&2
    continue
  fi
  echo "  OK — issue #${issue_num} updated"
done
echo "Done."

View file

@ -56,64 +56,6 @@ ci_required_for_pr() {
echo "$files" | diff_has_code_files
}
# ci_required_contexts [branch] — get required status check contexts from branch protection.
# Cached per poll cycle (module-level variable) to avoid repeated API calls.
# Stdout: newline-separated list of required context names, or empty if none configured.
# shellcheck disable=SC2120 # branch arg is optional, callers may omit it
ci_required_contexts() {
  # Serve the memoized value when a prior call already resolved it
  # (the +set expansion distinguishes "cached empty" from "not yet cached").
  if [ -n "${_CI_REQUIRED_CONTEXTS+set}" ]; then
    printf '%s' "$_CI_REQUIRED_CONTEXTS"
    return
  fi

  local target_branch="${1:-${PRIMARY_BRANCH:-main}}"
  local protection_json
  protection_json=$(forge_api GET "/branch_protections/${target_branch}" 2>/dev/null) || protection_json=""

  # Default to "no required contexts"; overwrite only when branch protection
  # exists and status checks are actually enabled on it.
  _CI_REQUIRED_CONTEXTS=""
  if [ -n "$protection_json" ] && [ "$protection_json" != "null" ]; then
    local checks_enabled
    checks_enabled=$(printf '%s' "$protection_json" | jq -r '.enable_status_check // false' 2>/dev/null) || checks_enabled="false"
    if [ "$checks_enabled" = "true" ]; then
      _CI_REQUIRED_CONTEXTS=$(printf '%s' "$protection_json" \
        | jq -r '.status_check_contexts // [] | .[]' 2>/dev/null) || _CI_REQUIRED_CONTEXTS=""
    fi
  fi
  printf '%s' "$_CI_REQUIRED_CONTEXTS"
}
# _ci_reduce_required_contexts <sha> <required_contexts>
# Reduce commit statuses to required contexts only.
# Fetches per-context statuses from the forge combined endpoint and filters.
# Stdout: success | failure | pending
#
# Args:
#   $1 — commit SHA to inspect
#   $2 — newline-separated required context names (as emitted by ci_required_contexts)
#
# Any API or jq failure degrades to "pending" rather than erroring, so an
# unreachable forge reads as "CI not finished yet" to callers.
_ci_reduce_required_contexts() {
local sha="$1" required="$2"
local status_json
# Combined-status endpoint: returns all status objects posted for this commit.
status_json=$(forge_api GET "/commits/${sha}/status" 2>/dev/null) || { echo "pending"; return; }
# jq reduction: for each required context, take the latest status object for
# that context (sort_by(.id) | last — re-runs post a new object, so the
# highest id wins; a context with no status yet reduces to "pending"), then
# fold: any failure/error → failure; all success → success; else pending.
# An empty $required list also yields "pending" (ci_commit_status only calls
# this helper when the list is non-empty, per its [ -n "$required" ] guard).
printf '%s' "$status_json" | jq -r --arg req "$required" '
($req | split("\n") | map(select(. != ""))) as $contexts |
.statuses as $all |
if ($contexts | length) == 0 then "pending"
else
[ $contexts[] as $ctx |
[$all[] | select(.context == $ctx)] | sort_by(.id) | last | .status // "pending"
] |
if any(. == "failure" or . == "error") then "failure"
elif all(. == "success") then "success"
else "pending"
end
end
' 2>/dev/null || echo "pending"
}
# ci_passed <state> — check if CI is passing (or no CI configured)
# Returns 0 if state is "success", or if no CI is configured and
# state is empty/pending/unknown.
@ -141,23 +83,11 @@ ci_failed() {
}
# ci_commit_status <sha> — get CI state for a commit
# When branch protection declares required status check contexts, reduces over
# just those — optional workflows that are stuck/failed do not block decisions.
# Otherwise queries Woodpecker API directly, falls back to forge combined status.
# Queries Woodpecker API directly, falls back to forge commit status API.
ci_commit_status() {
local sha="$1"
local state=""
# When required contexts are configured, reduce over just those
local required
# shellcheck disable=SC2119 # branch arg defaults to PRIMARY_BRANCH
required=$(ci_required_contexts) || true
if [ -n "$required" ]; then
_ci_reduce_required_contexts "$sha" "$required"
return
fi
# No required-context filtering — original behavior
# Primary: ask Woodpecker directly
if [ -n "${WOODPECKER_REPO_ID:-}" ] && [ "${WOODPECKER_REPO_ID}" != "0" ]; then
state=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines" \

View file

@ -860,7 +860,6 @@ _generate_caddyfile_subpath() {
# Reverse proxy to Forgejo
handle /forge/* {
uri strip_prefix /forge
reverse_proxy forgejo:3000
}

View file

@ -1,22 +1,22 @@
#!/usr/bin/env bash
# guard.sh — Active-state guard for polling-loop entry points
#
# Each role checks for a state file before running. If the file
# doesn't exist, the role logs a skip and exits cleanly.
# Each agent checks for a state file before running. If the file
# doesn't exist, the agent logs a skip and exits cleanly.
#
# State files live in $FACTORY_ROOT/state/:
# .dev-active, .reviewer-active, .planner-active, etc.
#
# Presence = permission to run. Absence = skip (factory off by default).
# check_active <role_name>
# check_active <agent_name>
# Exit 0 (skip) if the state file is absent.
check_active() {
local role_name="$1"
local state_file="${FACTORY_ROOT}/state/.${role_name}-active"
local agent_name="$1"
local state_file="${FACTORY_ROOT}/state/.${agent_name}-active"
if [ ! -f "$state_file" ]; then
echo "[check_active] SKIP: state file state/.${role_name}-active not found — role disabled" >&2
log "${role_name} not active — skipping"
echo "[check_active] SKIP: state file state/.${agent_name}-active not found — agent disabled" >&2
log "${agent_name} not active — skipping"
exit 0
fi
}

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: 5be020b9de1a719cb331b930cf45caf7559473f7 -->
<!-- last-reviewed: 0d6181918452c1407a3f6bc62917724261acff26 -->
# nomad/ — Agent Instructions
Nomad + Vault HCL for the factory's single-node cluster. These files are
@ -21,7 +21,7 @@ see issues #821–#992 for the step breakdown.
| `jobs/agents.hcl` | submitted via `lib/init/nomad/deploy.sh` | All 7 agent roles (dev, review, gardener, planner, predictor, supervisor, architect) + llama variant; Vault-templated bot tokens via `service-agents` policy; `force_pull = false` — image is built locally by `bin/disinto --with agents`, no registry (S4.1, S4-fix-2, S4-fix-5, #955, #972, #978) |
| `jobs/staging.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy file-server mounting `docker/` as `/srv/site:ro`; no Vault integration; **dynamic host port** (no static 80 — edge owns 80/443, collision fixed in S5-fix-7 #1018); edge discovers via Nomad service registration (S5.2, #989) |
| `jobs/chat.hcl` | submitted via `lib/init/nomad/deploy.sh` | Claude chat UI; custom `disinto/chat:local` image; sandbox hardening (cap_drop ALL, **tmpfs via mount block** not `tmpfs=` arg — S5-fix-5 #1012, pids_limit 128); Vault-templated OAuth secrets via `service-chat` policy (S5.2, #989); rate limiting removed (#1084); **workspace volume** `chat-workspace` host_volume bind-mounted to `/var/workspace` for Claude project access (#1027) — operator must register `host_volume "chat-workspace"` in `client.hcl` on each node |
| `jobs/edge.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy reverse proxy + dispatcher sidecar; routes /forge, /woodpecker, /staging, /chat; uses `disinto/edge:local` image built by `bin/disinto --with edge`; **both Caddy and dispatcher tasks use `network_mode = "host"`** — upstreams are `127.0.0.1:<port>` (forgejo :3000, woodpecker :8000, chat :8080), not Docker hostnames (#1031, #1034); `FORGE_URL` rendered via Nomad service discovery template (`nomadService "forgejo"` — switched from Consul `service` lookup to Nomad native service discovery, #1114) to handle bridge vs. host network differences (#1034); dispatcher Vault secret path changed to `kv/data/disinto/shared/ops-repo` (#1041); Vault-templated ops-repo creds via `service-dispatcher` policy (S5.1, #988); `/forge/*` handler adds `uri strip_prefix /forge` before proxying to forgejo (#1103); `/staging/*` strips `/staging` prefix before proxying (#1079); WebSocket endpoint `/chat/ws` uses `header_up` inside `reverse_proxy` block (moved from handle-block top level — Caddy rejects top-level `header_up`, #1117); `/chat/ws` added for streaming (#1026) |
| `jobs/edge.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy reverse proxy + dispatcher sidecar; routes /forge, /woodpecker, /staging, /chat; uses `disinto/edge:local` image built by `bin/disinto --with edge`; **both Caddy and dispatcher tasks use `network_mode = "host"`** — upstreams are `127.0.0.1:<port>` (forgejo :3000, woodpecker :8000, chat :8080), not Docker hostnames (#1031, #1034); `FORGE_URL` rendered via Nomad service discovery template (not static env) to handle bridge vs. host network differences (#1034); dispatcher Vault secret path changed to `kv/data/disinto/shared/ops-repo` (#1041); Vault-templated ops-repo creds via `service-dispatcher` policy (S5.1, #988); `/staging/*` strips `/staging` prefix before proxying (#1079); WebSocket endpoint `/chat/ws` added for streaming (#1026) |
Nomad auto-merges every `*.hcl` under `-config=/etc/nomad.d/`, so the
split between `server.hcl` and `client.hcl` is for readability, not

View file

@ -6,10 +6,10 @@
# dispatcher sidecar polls disinto-ops for vault actions and dispatches them
# via Nomad batch jobs.
#
# All upstreams discovered via Nomad service discovery (issue #1156, S5-fix-7).
# Caddy uses network_mode = "host" but upstreams run in separate alloc netns,
so loopback addresses are unreachable — nomadService templates resolve the
# dynamic address:port for each backend.
# Host networking (issue #1031):
# Caddy uses network_mode = "host" so upstreams are reached at
# 127.0.0.1:<port> (forgejo :3000, woodpecker :8000, chat :8080).
# Staging uses Nomad service discovery (S5-fix-7, issue #1018).
#
# Host_volume contract:
# This job mounts caddy-data from nomad/client.hcl. Path
@ -120,15 +120,17 @@ job "edge" {
read_only = false
}
# Caddyfile via Nomad service discovery (S5-fix-7, issue #1018/1156)
# All upstreams rendered from Nomad service registration. Caddy picks up
# /local/Caddyfile via entrypoint.
# Caddyfile via Nomad service discovery (S5-fix-7, issue #1018)
# Renders staging upstream from Nomad service registration instead of
# hardcoded staging:80. Caddy picks up /local/Caddyfile via entrypoint.
# Forge URL via Nomad service discovery (issue #1034) resolves forgejo
# service address/port dynamically for bridge network compatibility.
template {
destination = "local/forge.env"
env = true
change_mode = "restart"
data = <<EOT
{{ range nomadService "forgejo" -}}
{{ range service "forgejo" -}}
FORGE_URL=http://{{ .Address }}:{{ .Port }}
{{- end }}
EOT
@ -147,16 +149,15 @@ EOT
redir /forge/ 302
}
# Reverse proxy to Forgejo dynamic via Nomad service discovery (#1156)
# Reverse proxy to Forgejo
handle /forge/* {
uri strip_prefix /forge
{{ range nomadService "forgejo" }} reverse_proxy {{ .Address }}:{{ .Port }}
{{ end }} }
reverse_proxy 127.0.0.1:3000
}
# Reverse proxy to Woodpecker CI dynamic via Nomad service discovery (#1156)
# Reverse proxy to Woodpecker CI
handle /ci/* {
{{ range nomadService "woodpecker" }} reverse_proxy {{ .Address }}:{{ .Port }}
{{ end }} }
reverse_proxy 127.0.0.1:8000
}
# Reverse proxy to staging dynamic port via Nomad service discovery
handle /staging/* {
@ -164,30 +165,29 @@ EOT
{{ range nomadService "staging" }} reverse_proxy {{ .Address }}:{{ .Port }}
{{ end }} }
# Chat service reverse proxy to disinto-chat backend (#705, #1156)
# Chat service reverse proxy to disinto-chat backend (#705)
# OAuth routes bypass forward_auth — unauthenticated users need these (#709)
handle /chat/login {
{{ range nomadService "chat" }} reverse_proxy {{ .Address }}:{{ .Port }}
{{ end }} }
reverse_proxy 127.0.0.1:8080
}
handle /chat/oauth/callback {
{{ range nomadService "chat" }} reverse_proxy {{ .Address }}:{{ .Port }}
{{ end }} }
reverse_proxy 127.0.0.1:8080
}
# WebSocket endpoint for streaming (#1026)
handle /chat/ws {
{{ range nomadService "chat" }} reverse_proxy {{ .Address }}:{{ .Port }} {
header_up Upgrade {http.request.header.Upgrade}
header_up Connection {http.request.header.Connection}
}
{{ end }} }
header_up Upgrade $http.upgrade
header_up Connection $http.connection
reverse_proxy 127.0.0.1:8080
}
# Defense-in-depth: forward_auth stamps X-Forwarded-User from session (#709)
handle /chat/* {
{{ range nomadService "chat" }} forward_auth {{ .Address }}:{{ .Port }} {
forward_auth 127.0.0.1:8080 {
uri /chat/auth/verify
copy_headers X-Forwarded-User
header_up X-Forward-Auth-Secret {$FORWARD_AUTH_SECRET}
}
reverse_proxy {{ .Address }}:{{ .Port }}
{{ end }} }
reverse_proxy 127.0.0.1:8080
}
}
EOT
}
@ -241,7 +241,7 @@ EOT
env = true
change_mode = "restart"
data = <<EOT
{{ range nomadService "forgejo" -}}
{{ range service "forgejo" -}}
FORGE_URL=http://{{ .Address }}:{{ .Port }}
{{- end }}
EOT

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: 5be020b9de1a719cb331b930cf45caf7559473f7 -->
<!-- last-reviewed: 0d6181918452c1407a3f6bc62917724261acff26 -->
# Planner Agent
**Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints),

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: 5be020b9de1a719cb331b930cf45caf7559473f7 -->
<!-- last-reviewed: 0d6181918452c1407a3f6bc62917724261acff26 -->
# Predictor Agent
**Role**: Abstract adversary (the "goblin"). Runs a 2-step formula

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: 5be020b9de1a719cb331b930cf45caf7559473f7 -->
<!-- last-reviewed: 0d6181918452c1407a3f6bc62917724261acff26 -->
# Review Agent
**Role**: AI-powered PR review — post structured findings and formal

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: 5be020b9de1a719cb331b930cf45caf7559473f7 -->
<!-- last-reviewed: 0d6181918452c1407a3f6bc62917724261acff26 -->
# Supervisor Agent
**Role**: Health monitoring and auto-remediation, executed as a formula-driven
@ -8,7 +8,7 @@ issues, and writes a daily journal. When blocked on external
resources or human decisions, files vault items instead of escalating directly.
**Trigger**: `supervisor-run.sh` is invoked by two polling loops:
- **Agents container** (`docker/agents/entrypoint.sh`): every `SUPERVISOR_INTERVAL` seconds (default 1200 = 20 min). Controlled by the `supervisor` role in `AGENT_ROLES` (included in the default seven-role set since P1/#801). Logs to `data/logs/supervisor/supervisor.log` (canonical sink — both `supervisor-run.sh` internal logging and entrypoint stderr redirect write to this single file).
- **Agents container** (`docker/agents/entrypoint.sh`): every `SUPERVISOR_INTERVAL` seconds (default 1200 = 20 min). Controlled by the `supervisor` role in `AGENT_ROLES` (included in the default seven-role set since P1/#801). Logs to `supervisor.log` in the agents container.
- **Edge container** (`docker/edge/entrypoint-edge.sh`): separate loop in the edge container (line 169-172). Runs independently of the agents container's polling schedule.
Both invoke the same `supervisor-run.sh`. Sources `lib/guard.sh` and calls `check_active supervisor` first — skips if `$FACTORY_ROOT/state/.supervisor-active` is absent. Then runs `claude -p` via `agent-sdk.sh`, injects `formulas/run-supervisor.toml` with pre-collected metrics as context, and cleans up on completion or timeout.
@ -39,11 +39,6 @@ Both invoke the same `supervisor-run.sh`. Sources `lib/guard.sh` and calls `chec
- `$OPS_REPO_ROOT/knowledge/*.md` — Domain-specific remediation guides (memory,
disk, CI, git, dev-agent, review-agent, forge)
**Canonical log sink**: `data/logs/supervisor/supervisor.log` — all supervisor output
(structured log from `supervisor-run.sh` and stderr from the entrypoint invocation)
goes to this single file. Do not introduce a second path; see #1150 for the dual-sink
incident that motivated unification.
**Alert priorities**: P0 (memory crisis), P1 (disk), P2 (factory stopped/stalled),
P3 (degraded PRs, circular deps, stale deps), P4 (housekeeping).

View file

@ -1,13 +0,0 @@
# Test fixture: curl without --max-time should trigger a warning
# Used by tests/test-lint-ci.bats to verify the command-level timeout check
# (the workflow-level `timeout` below satisfies the step-timeout rule, so
# only the curl warning should fire).
when:
  - event: pull_request

timeout: 5m

steps:
  - name: bad-curl
    image: alpine:3
    commands:
      - curl https://example.com

View file

@ -1,13 +0,0 @@
# Test fixture: curl with --max-time should pass cleanly
# Used by tests/test-lint-ci.bats to verify the command-level timeout check
# (curl carries --max-time and a workflow timeout exists, so no diagnostics
# are expected).
when:
  - event: pull_request

timeout: 5m

steps:
  - name: good-curl
    image: alpine:3
    commands:
      - curl --max-time 30 https://example.com

View file

@ -1,11 +0,0 @@
# Test fixture: step without timeout should trigger an error
# Used by tests/test-lint-ci.bats to verify the step-level timeout check
# (deliberately omits both workflow-level and step-level `timeout`).
when:
  - event: pull_request

steps:
  - name: no-timeout-step
    image: alpine:3
    commands:
      - echo "this step has no timeout"

View file

@ -1,13 +0,0 @@
# Test fixture: workflow-level timeout should satisfy all steps
# Used by tests/test-lint-ci.bats to verify workflow-level timeout propagation
# (no step-level timeout anywhere; the 10m workflow timeout covers the step).
when:
  - event: pull_request

timeout: 10m

steps:
  - name: inherits-timeout
    image: alpine:3
    commands:
      - echo "inherits workflow timeout"

View file

@ -1,233 +0,0 @@
#!/usr/bin/env bats
# =============================================================================
# tests/lib-ci-required-contexts.bats — Unit tests for ci_required_contexts()
# and the required-context reducer in ci_commit_status().
#
# Verifies that when branch protection declares required status check contexts,
# ci_commit_status() reduces over just those — optional workflows that are
# stuck/failed do not block decisions (#1136).
#
# Uses a curl shim to return canned forge API responses.
# =============================================================================
# setup — runs before every test: pins a fake factory environment, resets the
# required-contexts cache, and installs shell-function mocks (forge_api,
# woodpecker_api, curl) so lib/ci-helpers.sh never touches the network.
setup() {
ROOT="$(cd "$(dirname "$BATS_TEST_FILENAME")/.." && pwd)"
export FACTORY_ROOT="$ROOT"
export FORGE_TOKEN="dummy-token"
export FORGE_URL="https://forge.example.test"
export FORGE_API="${FORGE_URL}/api/v1/repos/owner/repo"
export PRIMARY_BRANCH="main"
export WOODPECKER_REPO_ID="0" # disable Woodpecker path
# Reset cache between tests
unset _CI_REQUIRED_CONTEXTS
# Every mocked curl invocation is appended here so tests can count API calls.
export CALLS_LOG="${BATS_TEST_TMPDIR}/curl-calls.log"
: > "$CALLS_LOG"
# Mock forge_api — mirrors lib/env.sh shape
forge_api() {
local method="$1" path="$2"
shift 2
curl -sf -X "$method" \
-H "Authorization: token ${FORGE_TOKEN}" \
-H "Content-Type: application/json" \
"${FORGE_API}${path}" "$@"
}
# Mock forge_api_all (used by some ci-helpers functions)
forge_api_all() {
forge_api GET "$1"
}
# Mock woodpecker_api (not used when WOODPECKER_REPO_ID=0, but needed for source)
woodpecker_api() { return 1; }
# Default mock responses — overridden per test
# Branch protection: status checks enabled, "ci" is required
export MOCK_BP_ENABLED="true"
export MOCK_BP_CONTEXTS='["ci"]'
# Commit statuses: "ci" success, "edge-subpath" pending
export MOCK_STATUSES='[
{"id":1,"context":"ci","status":"success","created_at":"2026-01-01T00:00:00Z"},
{"id":2,"context":"edge-subpath","status":"pending","created_at":"2026-01-01T00:00:01Z"}
]'
# curl shim: parses just enough of curl's CLI to find the URL, logs the call,
# and returns a canned JSON body keyed on the URL path. Unknown URLs fail (rc 1)
# so code under test exercises its error fallbacks.
curl() {
local method="GET" url="" arg
while [ $# -gt 0 ]; do
arg="$1"
case "$arg" in
-X) method="$2"; shift 2 ;;
-H|-d|--data-binary|-o) shift 2 ;;
-w) shift 2 ;;
-sf|-s|-f|--silent|--fail) shift ;;
*) url="$arg"; shift ;;
esac
done
printf '%s %s\n' "$method" "$url" >> "$CALLS_LOG"
case "$url" in
*"/branch_protections/"*)
printf '{"enable_status_check":%s,"status_check_contexts":%s}' \
"$MOCK_BP_ENABLED" "$MOCK_BP_CONTEXTS"
;;
*"/commits/"*"/status")
printf '{"state":"pending","statuses":%s}' "$MOCK_STATUSES"
;;
*)
return 1
;;
esac
return 0
}
# Source last so the library binds against the mocks defined above.
source "${ROOT}/lib/ci-helpers.sh"
}
# ── ci_required_contexts tests ───────────────────────────────────────────────
@test "ci_required_contexts returns context list when status checks enabled" {
run ci_required_contexts
[ "$status" -eq 0 ]
[[ "$output" == "ci" ]]
}
@test "ci_required_contexts returns empty when status checks disabled" {
export MOCK_BP_ENABLED="false"
unset _CI_REQUIRED_CONTEXTS
run ci_required_contexts
[ "$status" -eq 0 ]
[ -z "$output" ]
}
@test "ci_required_contexts returns empty when branch protection not found" {
curl() {
return 1
}
unset _CI_REQUIRED_CONTEXTS
run ci_required_contexts
[ "$status" -eq 0 ]
[ -z "$output" ]
}
@test "ci_required_contexts caches result across calls" {
ci_required_contexts >/dev/null
ci_required_contexts >/dev/null
# Only one API call despite two invocations
local call_count
call_count=$(grep -c "branch_protections" "$CALLS_LOG" 2>/dev/null || echo 0)
[ "$call_count" -eq 1 ]
}
@test "ci_required_contexts returns multiple contexts" {
export MOCK_BP_CONTEXTS='["ci","lint"]'
unset _CI_REQUIRED_CONTEXTS
run ci_required_contexts
[ "$status" -eq 0 ]
[[ "$output" == *"ci"* ]]
[[ "$output" == *"lint"* ]]
}
# ── ci_commit_status with required contexts ──────────────────────────────────
@test "ci_commit_status returns success when required context passes (optional pending)" {
# "ci" is success, "edge-subpath" is pending — should report success
run ci_commit_status "abc123"
[ "$status" -eq 0 ]
[[ "$output" == "success" ]]
}
@test "ci_commit_status returns failure when required context fails (optional success)" {
export MOCK_STATUSES='[
{"id":1,"context":"ci","status":"failure","created_at":"2026-01-01T00:00:00Z"},
{"id":2,"context":"edge-subpath","status":"success","created_at":"2026-01-01T00:00:01Z"}
]'
unset _CI_REQUIRED_CONTEXTS
run ci_commit_status "abc123"
[ "$status" -eq 0 ]
[[ "$output" == "failure" ]]
}
@test "ci_commit_status returns pending when required context has no status yet" {
export MOCK_STATUSES='[
{"id":1,"context":"edge-subpath","status":"success","created_at":"2026-01-01T00:00:00Z"}
]'
unset _CI_REQUIRED_CONTEXTS
run ci_commit_status "abc123"
[ "$status" -eq 0 ]
[[ "$output" == "pending" ]]
}
@test "ci_commit_status returns success when all required contexts pass" {
export MOCK_BP_CONTEXTS='["ci","lint"]'
export MOCK_STATUSES='[
{"id":1,"context":"ci","status":"success","created_at":"2026-01-01T00:00:00Z"},
{"id":2,"context":"lint","status":"success","created_at":"2026-01-01T00:00:01Z"},
{"id":3,"context":"edge-subpath","status":"failure","created_at":"2026-01-01T00:00:02Z"}
]'
unset _CI_REQUIRED_CONTEXTS
run ci_commit_status "abc123"
[ "$status" -eq 0 ]
[[ "$output" == "success" ]]
}
@test "ci_commit_status returns failure when any required context fails" {
export MOCK_BP_CONTEXTS='["ci","lint"]'
export MOCK_STATUSES='[
{"id":1,"context":"ci","status":"success","created_at":"2026-01-01T00:00:00Z"},
{"id":2,"context":"lint","status":"error","created_at":"2026-01-01T00:00:01Z"},
{"id":3,"context":"edge-subpath","status":"success","created_at":"2026-01-01T00:00:02Z"}
]'
unset _CI_REQUIRED_CONTEXTS
run ci_commit_status "abc123"
[ "$status" -eq 0 ]
[[ "$output" == "failure" ]]
}
@test "ci_commit_status uses latest status per context (re-run overwrites)" {
export MOCK_STATUSES='[
{"id":1,"context":"ci","status":"failure","created_at":"2026-01-01T00:00:00Z"},
{"id":3,"context":"ci","status":"success","created_at":"2026-01-01T00:01:00Z"}
]'
unset _CI_REQUIRED_CONTEXTS
run ci_commit_status "abc123"
[ "$status" -eq 0 ]
[[ "$output" == "success" ]]
}
# ── incident reproduction shape ──────────────────────────────────────────────
@test "incident shape: required ci passes, optional edge-subpath stuck pending — returns success" {
# This is the exact scenario from the 2026-04-21 incident:
# - "ci" workflow: success
# - "edge-subpath" (optional): stuck pending
# - Combined state would be "pending" (worst of all)
# - With fix: only "ci" matters → success
export MOCK_BP_CONTEXTS='["ci"]'
export MOCK_STATUSES='[
{"id":1,"context":"ci","status":"success","created_at":"2026-01-01T00:00:00Z"},
{"id":2,"context":"edge-subpath","status":"pending","created_at":"2026-01-01T00:00:01Z"},
{"id":3,"context":"caddy-validate","status":"failure","created_at":"2026-01-01T00:00:02Z"}
]'
unset _CI_REQUIRED_CONTEXTS
run ci_commit_status "abc123"
[ "$status" -eq 0 ]
[[ "$output" == "success" ]]
}
# ── fallback: no required contexts → original behavior ───────────────────────
@test "ci_commit_status falls back to combined state when no required contexts" {
export MOCK_BP_ENABLED="false"
export WOODPECKER_REPO_ID="0"
unset _CI_REQUIRED_CONTEXTS
# Combined state is "pending" (from MOCK_STATUSES default)
# Without required contexts, falls through to forge combined .state
run ci_commit_status "abc123"
[ "$status" -eq 0 ]
# Falls back to .state from combined endpoint → "pending"
[[ "$output" == "pending" ]]
}

View file

@ -89,13 +89,6 @@ check_forgejo_routing() {
tr_fail "Missing Forgejo handle block (handle /forge/*)"
fi
# Check uri strip_prefix /forge (required for Forgejo routing)
if echo "$CADDYFILE" | grep -q "uri strip_prefix /forge"; then
tr_pass "Forgejo strip_prefix configured (/forge)"
else
tr_fail "Missing Forgejo strip_prefix (/forge)"
fi
# Check reverse_proxy to Forgejo on port 3000
if echo "$CADDYFILE" | grep -q "reverse_proxy 127.0.0.1:3000"; then
tr_pass "Forgejo reverse_proxy configured (127.0.0.1:3000)"

View file

@ -1,52 +0,0 @@
# tests/test-lint-ci.bats — Tests for `disinto validate lint-ci`
#
# Verifies the CI timeout validator:
#   1. Step-level timeout errors fire when missing
#   2. Workflow-level timeout satisfies all steps
#   3. curl without --max-time triggers a warning
#   4. curl with --max-time passes cleanly
#
# All tests use bats' `run` helper. The previous `output=$(cmd); local rc=$?`
# pattern aborted the test on any non-zero exit (bats runs test bodies with
# errexit semantics), so the rc=1 assertion in the first test was unreachable.
# `run` captures the exit code in $status and merged stdout+stderr in $output
# without tripping errexit.

load bats

DISINTO="${FACTORY_ROOT:-$(cd "$(dirname "$0")/.." && pwd)}/bin/disinto"
FIXTURES="$(cd "$(dirname "$0")/fixtures/lint-ci" && pwd)"

# ── Step-level timeout errors ────────────────────────────────────────────────

@test "missing step timeout triggers error" {
  run bash "$DISINTO" validate lint-ci "$FIXTURES/missing-timeout"
  echo "$output"
  [ "$status" -eq 1 ]
  echo "$output" | grep -q "error:.*no-timeout-step.*step has no timeout"
}

@test "workflow-level timeout satisfies all steps" {
  run bash "$DISINTO" validate lint-ci "$FIXTURES/workflow-timeout"
  echo "$output"
  [ "$status" -eq 0 ]
  echo "$output" | grep -q "lint-ci: 0 error(s), 0 warning(s)"
}

# ── Command-level timeout warnings ───────────────────────────────────────────

@test "curl without --max-time triggers warning" {
  run bash "$DISINTO" validate lint-ci "$FIXTURES/bad-curl"
  echo "$output"
  [ "$status" -eq 0 ]
  echo "$output" | grep -q "warning:.*curl without --max-time"
}

@test "curl with --max-time passes cleanly" {
  run bash "$DISINTO" validate lint-ci "$FIXTURES/good-curl"
  echo "$output"
  [ "$status" -eq 0 ]
  echo "$output" | grep -q "lint-ci: 0 error(s), 0 warning(s)"
}

View file

@ -183,16 +183,11 @@ Shows all registered tunnels with their ports and FQDNs.
## Allowlist
The allowlist prevents project name squatting by requiring admin approval before a name can be registered. It is **opt-in**: when `allowlist.json` does not exist, registration is unrestricted. When the file exists, only project names listed in the `allowed` map can be registered.
The allowlist prevents project name squatting by requiring admin approval before a name can be registered. It is **opt-in**: when `allowlist.json` is empty (no project entries), registration works as before. Once the admin adds entries, only approved names are accepted.
### Install-time behavior
### Setup
- **Fresh install**: `install.sh` seeds an empty allowlist (`{"version":1,"allowed":{}}`) and prints a warning that registration is now gated until entries are added.
- **Upgrade onto an existing box**: if `registry.json` has registered projects but `allowlist.json` does not exist, `install.sh` auto-populates the allowlist with each existing project name (unbound — `pubkey_fingerprint: ""`). This preserves current behavior so existing tunnels keep working. The operator can tighten pubkey bindings later.
### Format
`/var/lib/disinto/allowlist.json` (root-owned, `0644`):
Edit `/var/lib/disinto/allowlist.json` as root:
```json
{
@ -208,9 +203,9 @@ The allowlist prevents project name squatting by requiring admin approval before
}
```
- **With `pubkey_fingerprint`** (non-empty): only the SSH key with that exact SHA256 fingerprint can register this project name.
- **With empty `pubkey_fingerprint`**: any caller may register this project name (name reservation without key binding).
- **Not listed in `allowed`**: registration is refused with `{"error":"name not approved"}`.
- **With `pubkey_fingerprint`**: Only the specified SSH key can register this project name. The fingerprint is the SHA256 output of `ssh-keygen -lf <keyfile>`.
- **With empty `pubkey_fingerprint`**: Any caller may register this project name (name reservation without key binding).
- **Not listed**: Registration is refused with `{"error":"name not approved"}`.
### Workflow

View file

@ -8,11 +8,9 @@
# What it does:
# 1. Creates users: disinto-register, disinto-tunnel
# 2. Creates /var/lib/disinto/ with registry.json, registry.lock, allowlist.json
# 3. On upgrade: auto-populates allowlist.json from existing registry entries
# 4. On fresh install: seeds empty allowlist with warning (registration gated)
# 5. Installs Caddy with Gandi DNS plugin
# 6. Sets up SSH authorized_keys for both users
# 7. Installs control plane scripts to /opt/disinto-edge/
# 3. Installs Caddy with Gandi DNS plugin
# 4. Sets up SSH authorized_keys for both users
# 5. Installs control plane scripts to /opt/disinto-edge/
#
# Requirements:
# - Fresh Debian 12 (Bookworm)
@ -160,39 +158,12 @@ LOCK_FILE="${REGISTRY_DIR}/registry.lock"
touch "$LOCK_FILE"
chmod 0644 "$LOCK_FILE"
# Initialize allowlist.json
# Initialize allowlist.json (empty = no restrictions until admin populates)
ALLOWLIST_FILE="${REGISTRY_DIR}/allowlist.json"
_ALLOWLIST_MODE=""
if [ ! -f "$ALLOWLIST_FILE" ]; then
# Check whether the registry already has projects that need allowlisting
_EXISTING_PROJECTS=""
if [ -f "$REGISTRY_FILE" ]; then
_EXISTING_PROJECTS=$(jq -r '.projects // {} | keys[]' "$REGISTRY_FILE" 2>/dev/null) || _EXISTING_PROJECTS=""
fi
if [ -n "$_EXISTING_PROJECTS" ]; then
# Upgrade path: auto-populate allowlist with existing projects (unbound).
# This preserves current behavior — existing tunnels keep working.
# Operator can tighten pubkey bindings later.
_ALLOWED='{}'
_PROJECT_COUNT=0
while IFS= read -r _proj; do
_ALLOWED=$(echo "$_ALLOWED" | jq --arg p "$_proj" '. + {($p): {"pubkey_fingerprint": ""}}')
_PROJECT_COUNT=$((_PROJECT_COUNT + 1))
done <<< "$_EXISTING_PROJECTS"
echo "{\"version\":1,\"allowed\":${_ALLOWED}}" | jq '.' > "$ALLOWLIST_FILE"
chmod 0644 "$ALLOWLIST_FILE"
chown root:root "$ALLOWLIST_FILE"
_ALLOWLIST_MODE="upgraded:${_PROJECT_COUNT}"
log_info "Initialized allowlist with ${_PROJECT_COUNT} existing project(s): ${ALLOWLIST_FILE}"
else
# Fresh install: seed empty allowlist and warn the operator.
echo '{"version":1,"allowed":{}}' > "$ALLOWLIST_FILE"
chmod 0644 "$ALLOWLIST_FILE"
chown root:root "$ALLOWLIST_FILE"
_ALLOWLIST_MODE="fresh-empty"
log_warn "Allowlist seeded empty — no project can register until you add entries to ${ALLOWLIST_FILE}."
fi
echo '{"version":1,"allowed":{}}' > "$ALLOWLIST_FILE"
chmod 0644 "$ALLOWLIST_FILE"
chown root:root "$ALLOWLIST_FILE"
log_info "Initialized allowlist: ${ALLOWLIST_FILE}"
fi
@ -210,7 +181,7 @@ chmod 0750 "$LOG_DIR"
# Touch the log file so it exists from day one
touch "$LOG_FILE"
chmod 0660 "$LOG_FILE"
chmod 0640 "$LOG_FILE"
chown root:disinto-register "$LOG_FILE"
# Install logrotate config (daily rotation, 30 days retention)
@ -223,7 +194,7 @@ ${LOG_FILE} {
delaycompress
missingok
notifempty
create 0660 root disinto-register
create 0640 root disinto-register
copytruncate
}
EOF
@ -469,7 +440,6 @@ echo ""
echo "Configuration:"
echo " Install directory: ${INSTALL_DIR}"
echo " Registry: ${REGISTRY_FILE}"
echo " Allowlist: ${ALLOWLIST_FILE}"
echo " Caddy admin API: http://127.0.0.1:2019"
echo " Operator site blocks: ${EXTRA_DIR}/ (import ${EXTRA_CADDYFILE})"
echo ""
@ -477,23 +447,6 @@ echo "Users:"
echo " disinto-register - SSH forced command (runs ${INSTALL_DIR}/register.sh)"
echo " disinto-tunnel - Reverse tunnel receiver (no shell)"
echo ""
echo "Allowlist:"
case "${_ALLOWLIST_MODE:-}" in
upgraded:*)
echo " Allowlist was auto-populated from existing registry entries."
echo " Existing projects can register without further action."
;;
fresh-empty)
echo " Allowlist is empty — registration is GATED until you add entries."
echo " Edit ${ALLOWLIST_FILE} as root:"
echo ' {"version":1,"allowed":{"myproject":{"pubkey_fingerprint":""}}}'
echo " See ${INSTALL_DIR}/../README.md for the full workflow."
;;
*)
echo " Allowlist already existed (no changes made)."
;;
esac
echo ""
echo "Next steps:"
echo " 1. Verify Caddy is running: systemctl status caddy"
echo " 2. Test SSH access: ssh disinto-register@localhost 'list'"

View file

@ -244,22 +244,23 @@ do_deregister() {
# Record who is deregistering before removal
local deregistered_by="$CALLER"
# Get current port and stored pubkey before removing
local port stored_pubkey pubkey_fp
# Get current port and pubkey before removing
local port pubkey_fp
port=$(get_port "$project")
stored_pubkey=$(get_project_info "$project" | jq -r '.pubkey // empty' 2>/dev/null) || stored_pubkey=""
# Return a single generic error — project nonexistence and ownership
# failure must not be distinguishable to the caller (prevents enumeration).
if [ -z "$port" ] || [ "$caller_pubkey" != "$stored_pubkey" ]; then
# Audit the attempt before we fail so operators can investigate.
pubkey_fp=$(ssh-keygen -lf /dev/stdin <<<"$stored_pubkey" 2>/dev/null | awk '{print $2}') || pubkey_fp="unknown"
audit_log "deregister" "$project" "${port:-unknown}" "$pubkey_fp"
echo '{"error":"deregister denied"}'
if [ -z "$port" ]; then
echo '{"error":"project not found"}'
exit 1
fi
# Verify caller owns this project — pubkey must match stored value
local stored_pubkey
stored_pubkey=$(get_project_info "$project" | jq -r '.pubkey // empty' 2>/dev/null) || stored_pubkey=""
if [ "$caller_pubkey" != "$stored_pubkey" ]; then
echo '{"error":"pubkey mismatch"}'
exit 1
fi
# Compute fingerprint for success-path audit log
pubkey_fp=$(ssh-keygen -lf /dev/stdin <<<"$stored_pubkey" 2>/dev/null | awk '{print $2}') || pubkey_fp="unknown"
# Remove from registry

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: 5be020b9de1a719cb331b930cf45caf7559473f7 -->
<!-- last-reviewed: 0d6181918452c1407a3f6bc62917724261acff26 -->
# vault/policies/ — Agent Instructions
HashiCorp Vault ACL policies for the disinto factory. One `.hcl` file per