Compare commits
1 commit
c0697ab27b
...
31b5e11006
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
31b5e11006 |
4 changed files with 11 additions and 10 deletions
|
|
@ -113,6 +113,8 @@ The supervisor-run.sh script automatically handles WP agent recovery:
|
|||
- Posts recovery comment with infra-flake context
|
||||
- Avoids duplicate restarts via 5-minute cooldown in history file
|
||||
|
||||
**P0 Memory crisis:**
|
||||
|
||||
**P0 Memory crisis:**
|
||||
# Kill stale one-shot claude processes (>3h old)
|
||||
pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true
|
||||
|
|
|
|||
|
|
@ -256,7 +256,7 @@ echo "Status: $_wp_health_status"
|
|||
# Check for gRPC errors in agent logs (last 20 minutes)
|
||||
_wp_grpc_errors=0
|
||||
if [ "$_wp_health_status" != "not_found" ] && [ -n "$_wp_health_status" ]; then
|
||||
_wp_grpc_errors=$(docker logs --since 20m "$_wp_container" 2>&1 | grep -c 'grpc error' || echo "0")
|
||||
_wp_grpc_errors=$(docker logs --since 20m "$_wp_container" 2>&1 2>/dev/null | grep -c 'grpc error' || echo "0")
|
||||
echo "gRPC errors (last 20m): $_wp_grpc_errors"
|
||||
fi
|
||||
|
||||
|
|
|
|||
|
|
@ -175,8 +175,7 @@ _WP_HEALTH_CHECK_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health-check.md"
|
|||
echo "$PREFLIGHT_OUTPUT" > "$_WP_HEALTH_CHECK_FILE"
|
||||
|
||||
# Extract WP agent health status from preflight output
|
||||
# Note: match exact "healthy" not "UNHEALTHY" (substring issue)
|
||||
_wp_agent_healthy=$(grep "^WP Agent Health: healthy$" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null && echo "true" || echo "false")
|
||||
_wp_agent_healthy=$(grep "^WP Agent Health:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | grep -q "healthy" && echo "true" || echo "false")
|
||||
_wp_health_reason=$(grep "^Reason:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "")
|
||||
|
||||
if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
|
||||
|
|
@ -202,7 +201,7 @@ if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
|
|||
# Restart the WP agent container
|
||||
if docker restart "$WP_AGENT_CONTAINER_NAME" >/dev/null 2>&1; then
|
||||
_restart_time=$(date -u '+%Y-%m-%d %H:%M UTC')
|
||||
log "Successfully restarted WP agent container: $WP_AGENT_CONTAINER_NAME"
|
||||
log "Successfully restarted WP agent container: $_wp_agent_healthy"
|
||||
|
||||
# Update history file
|
||||
echo "LAST_RESTART_TS=$_current_ts" > "$_WP_HEALTH_HISTORY_FILE"
|
||||
|
|
@ -307,7 +306,7 @@ EOF
|
|||
-H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
"${FORGE_API}/issues/$_issue_num/comments" \
|
||||
-d "$(jq -n --arg body "$_recovery_comment" '{body: $body}')" >/dev/null 2>&1 || true
|
||||
-d "{\"body\":$_recovery_comment}" >/dev/null 2>&1 || true
|
||||
|
||||
log "Recovered issue #$_issue_num - returned to pool"
|
||||
fi
|
||||
|
|
|
|||
|
|
@ -151,9 +151,9 @@ _kv_put_secret() {
|
|||
-X POST \
|
||||
-d "$payload" \
|
||||
-o "$tmpfile" \
|
||||
"${VAULT_ADDR}/v1/${VAULT_KV_MOUNT:-kv}/data/${path}")" || {
|
||||
"${VAULT_ADDR}/v1/kv/data/${path}")" || {
|
||||
rm -f "$tmpfile"
|
||||
_err "Failed to write to Vault at ${VAULT_KV_MOUNT:-kv}/data/${path}: curl error"
|
||||
_err "Failed to write to Vault at kv/data/${path}: curl error"
|
||||
return 1
|
||||
}
|
||||
rm -f "$tmpfile"
|
||||
|
|
@ -164,15 +164,15 @@ _kv_put_secret() {
|
|||
return 0
|
||||
;;
|
||||
404)
|
||||
_err "KV path not found: ${VAULT_KV_MOUNT:-kv}/data/${path}"
|
||||
_err "KV path not found: kv/data/${path}"
|
||||
return 1
|
||||
;;
|
||||
403)
|
||||
_err "Permission denied writing to ${VAULT_KV_MOUNT:-kv}/data/${path}"
|
||||
_err "Permission denied writing to kv/data/${path}"
|
||||
return 1
|
||||
;;
|
||||
*)
|
||||
_err "Failed to write to Vault at ${VAULT_KV_MOUNT:-kv}/data/${path}: HTTP $http_code"
|
||||
_err "Failed to write to Vault at kv/data/${path}: HTTP $http_code"
|
||||
return 1
|
||||
;;
|
||||
esac
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue