fix: [nomad-prep] P5 — add healthchecks to agents, edge, staging, woodpecker-agent (#794)
All checks were successful
ci/woodpecker/push/ci Pipeline was successful
ci/woodpecker/pr/ci Pipeline was successful
ci/woodpecker/pr/smoke-init Pipeline was successful

Add Docker healthcheck blocks so Nomad check stanzas map 1:1 at migration:

- agents / agents-llama: pgrep -f entrypoint.sh (60s interval)
- woodpecker-agent: wget healthz on :3333 (30s interval)
- edge: curl Caddy admin API on :2019 (30s interval)
- staging: wget Caddy admin API on :2019 (30s interval)
- chat: add /health endpoint to server.py (no-auth 200 OK), fix
  Dockerfile HEALTHCHECK to use it, add compose-level healthcheck

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude 2026-04-15 19:39:35 +00:00
parent 3b366ad96e
commit 8799a8c676
4 changed files with 63 additions and 1 deletions

View file

@ -320,6 +320,12 @@ services:
WOODPECKER_HEALTHCHECK_ADDR: ":3333"
WOODPECKER_BACKEND_DOCKER_NETWORK: disinto_disinto-net
WOODPECKER_MAX_WORKFLOWS: 1
# Probe the agent's health endpoint on :3333, the port configured through
# WOODPECKER_HEALTHCHECK_ADDR in this service's environment.
# NOTE(review): assumes a wget binary exists inside the agent image — confirm.
healthcheck:
  test:
    - CMD
    - wget
    - '-q'
    - '--spider'
    - 'http://localhost:3333/healthz'
  interval: 30s
  timeout: 5s
  retries: 3
  # Failures during the first 15s after start do not count against retries.
  start_period: 15s
depends_on:
- woodpecker
@ -374,6 +380,12 @@ services:
# Vault-only secrets (GITHUB_TOKEN, CLAWHUB_TOKEN, deploy keys) live in
# secrets/*.enc and are NEVER injected here — only the runner
# container receives them at fire time (AD-006, #745, #777).
# Process-level probe: passes while some process whose full command line
# matches "entrypoint.sh" is running (pgrep -f matches against the whole
# command line, not only the process name).
healthcheck:
  test:
    - CMD
    - pgrep
    - '-f'
    - entrypoint.sh
  interval: 60s
  timeout: 5s
  retries: 3
  # Failures during the first 30s after start do not count against retries.
  start_period: 30s
depends_on:
forgejo:
condition: service_healthy
@ -428,6 +440,12 @@ COMPOSEEOF
CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
POLL_INTERVAL: ${POLL_INTERVAL:-300}
AGENT_ROLES: dev
# Process-level probe: passes while some process whose full command line
# matches "entrypoint.sh" is running (pgrep -f matches against the whole
# command line, not only the process name).
healthcheck:
  test:
    - CMD
    - pgrep
    - '-f'
    - entrypoint.sh
  interval: 60s
  timeout: 5s
  retries: 3
  # Failures during the first 30s after start do not count against retries.
  start_period: 30s
depends_on:
forgejo:
condition: service_healthy
@ -499,6 +517,12 @@ LLAMAEOF
- ./secrets/tunnel_key:/run/secrets/tunnel_key:ro
- ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
- ${HOME}/.claude.json:/home/agent/.claude.json:ro
# HTTP probe against localhost:2019/config/ inside the container.
# curl flags: -f fails on HTTP errors, -sS stays quiet but still prints errors.
# NOTE(review): presumes the Caddy admin API is enabled on :2019 for this
# service — confirm against its Caddy configuration.
healthcheck:
  test:
    - CMD
    - curl
    - '-fsS'
    - 'http://localhost:2019/config/'
  interval: 30s
  timeout: 5s
  retries: 3
  # Failures during the first 15s after start do not count against retries.
  start_period: 15s
depends_on:
forgejo:
condition: service_healthy
@ -516,6 +540,12 @@ LLAMAEOF
command: ["caddy", "file-server", "--root", "/srv/site"]
security_opt:
- apparmor=unconfined
# Probe the file server itself rather than the Caddy admin API.
# This service is started with `caddy file-server`, and that subcommand runs
# Caddy with the admin endpoint disabled, so nothing listens on :2019 and a
# probe there can never pass. Without --listen/--domain the file server binds
# :80, which is what we check here.
# NOTE(review): wget --spider treats a 404 as failure, so the site root must
# serve an index page — verify that /srv/site contains one.
healthcheck:
  test: ["CMD", "wget", "-q", "--spider", "http://localhost:80/"]
  interval: 30s
  timeout: 5s
  retries: 3
  # Failures during the first 10s after start do not count against retries.
  start_period: 10s
volumes:
- ./docker:/srv/site:ro
networks:
@ -575,6 +605,12 @@ LLAMAEOF
CHAT_MAX_REQUESTS_PER_HOUR: ${CHAT_MAX_REQUESTS_PER_HOUR:-60}
CHAT_MAX_REQUESTS_PER_DAY: ${CHAT_MAX_REQUESTS_PER_DAY:-500}
CHAT_MAX_TOKENS_PER_DAY: ${CHAT_MAX_TOKENS_PER_DAY:-1000000}
# HTTP probe of the chat server's /health route on port 8080, done with the
# Python standard library so the probe needs neither curl nor wget.
# urlopen raises on connection failure or an HTTP error status, which makes
# the python3 process exit non-zero and marks the probe as failed.
healthcheck:
  test:
    - CMD
    - python3
    - '-c'
    - "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')"
  interval: 30s
  timeout: 5s
  retries: 3
  # Failures during the first 10s after start do not count against retries.
  start_period: 10s
networks:
- disinto-net