# Supervisor / Vault import fixes (text before the first "diff --git" header is
# ignored by git-apply and patch).
#
#   formulas/run-supervisor.toml
#     Drop the duplicated "**P0 Memory crisis:**" heading.
#   supervisor/preflight.sh
#     Count 'grpc error' lines across both stdout and stderr of `docker logs`
#     (the old `2>&1 2>/dev/null` ended up discarding stderr). `grep -c` already
#     prints "0" when nothing matches and exits 1, so only its exit status is
#     neutralised with `|| true`; a fallback `echo "0"` would yield "0\n0".
#   supervisor/supervisor-run.sh
#     Match the exact line "WP Agent Health: healthy". `grep -q` keeps the
#     matched line out of the command substitution so the variable is exactly
#     "true" or "false". Log the container name on restart, and build the
#     comment payload with jq.
#     NOTE(review): the previous code interpolated $_recovery_comment as a raw
#     JSON value; if it is already JSON-encoded where it is built, `jq --arg`
#     will encode it a second time -- confirm against its definition.
#   tools/vault-import.sh
#     Honour VAULT_KV_MOUNT (default "kv") in the write URL and error messages.
#
diff --git a/formulas/run-supervisor.toml b/formulas/run-supervisor.toml
index e623187..4101252 100644
--- a/formulas/run-supervisor.toml
+++ b/formulas/run-supervisor.toml
@@ -113,8 +113,6 @@ The supervisor-run.sh script automatically handles WP agent recovery:
 - Posts recovery comment with infra-flake context
 - Avoids duplicate restarts via 5-minute cooldown in history file
 
-**P0 Memory crisis:**
-
 **P0 Memory crisis:**
 # Kill stale one-shot claude processes (>3h old)
 pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true
diff --git a/supervisor/preflight.sh b/supervisor/preflight.sh
index 8430ba1..ee42c66 100755
--- a/supervisor/preflight.sh
+++ b/supervisor/preflight.sh
@@ -256,7 +256,7 @@ echo "Status: $_wp_health_status"
 # Check for gRPC errors in agent logs (last 20 minutes)
 _wp_grpc_errors=0
 if [ "$_wp_health_status" != "not_found" ] && [ -n "$_wp_health_status" ]; then
-  _wp_grpc_errors=$(docker logs --since 20m "$_wp_container" 2>&1 2>/dev/null | grep -c 'grpc error' || echo "0")
+  _wp_grpc_errors=$(docker logs --since 20m "$_wp_container" 2>&1 | grep -c 'grpc error' || true)
   echo "gRPC errors (last 20m): $_wp_grpc_errors"
 fi
 
diff --git a/supervisor/supervisor-run.sh b/supervisor/supervisor-run.sh
index 71df539..df644a6 100755
--- a/supervisor/supervisor-run.sh
+++ b/supervisor/supervisor-run.sh
@@ -175,7 +175,8 @@ _WP_HEALTH_CHECK_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health-check.md"
 echo "$PREFLIGHT_OUTPUT" > "$_WP_HEALTH_CHECK_FILE"
 
 # Extract WP agent health status from preflight output
-_wp_agent_healthy=$(grep "^WP Agent Health:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | grep -q "healthy" && echo "true" || echo "false")
+# Note: match exact "healthy" not "UNHEALTHY" (substring issue)
+_wp_agent_healthy=$(grep -q "^WP Agent Health: healthy$" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null && echo "true" || echo "false")
 _wp_health_reason=$(grep "^Reason:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "")
 
 if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
@@ -201,7 +202,7 @@ if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
   # Restart the WP agent container
   if docker restart "$WP_AGENT_CONTAINER_NAME" >/dev/null 2>&1; then
     _restart_time=$(date -u '+%Y-%m-%d %H:%M UTC')
-    log "Successfully restarted WP agent container: $_wp_agent_healthy"
+    log "Successfully restarted WP agent container: $WP_AGENT_CONTAINER_NAME"
 
     # Update history file
     echo "LAST_RESTART_TS=$_current_ts" > "$_WP_HEALTH_HISTORY_FILE"
@@ -306,7 +307,7 @@ EOF
           -H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
           -H "Content-Type: application/json" \
           "${FORGE_API}/issues/$_issue_num/comments" \
-          -d "{\"body\":$_recovery_comment}" >/dev/null 2>&1 || true
+          -d "$(jq -n --arg body "$_recovery_comment" '{body: $body}')" >/dev/null 2>&1 || true
 
         log "Recovered issue #$_issue_num - returned to pool"
       fi
diff --git a/tools/vault-import.sh b/tools/vault-import.sh
index bea4a07..f85dd16 100755
--- a/tools/vault-import.sh
+++ b/tools/vault-import.sh
@@ -151,9 +151,9 @@ _kv_put_secret() {
     -X POST \
     -d "$payload" \
     -o "$tmpfile" \
-    "${VAULT_ADDR}/v1/kv/data/${path}")" || {
+    "${VAULT_ADDR}/v1/${VAULT_KV_MOUNT:-kv}/data/${path}")" || {
     rm -f "$tmpfile"
-    _err "Failed to write to Vault at kv/data/${path}: curl error"
+    _err "Failed to write to Vault at ${VAULT_KV_MOUNT:-kv}/data/${path}: curl error"
     return 1
   }
   rm -f "$tmpfile"
@@ -164,15 +164,15 @@ _kv_put_secret() {
       return 0
       ;;
     404)
-      _err "KV path not found: kv/data/${path}"
+      _err "KV path not found: ${VAULT_KV_MOUNT:-kv}/data/${path}"
       return 1
       ;;
     403)
-      _err "Permission denied writing to kv/data/${path}"
+      _err "Permission denied writing to ${VAULT_KV_MOUNT:-kv}/data/${path}"
       return 1
       ;;
     *)
-      _err "Failed to write to Vault at kv/data/${path}: HTTP $http_code"
+      _err "Failed to write to Vault at ${VAULT_KV_MOUNT:-kv}/data/${path}: HTTP $http_code"
       return 1
       ;;
   esac