Compare commits
4 commits
31b5e11006
...
c0697ab27b
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c0697ab27b | ||
|
|
04ead1fbdc | ||
| c3e58e88ed | |||
|
|
99d3cb4c8f |
4 changed files with 10 additions and 11 deletions
|
|
@@ -113,8 +113,6 @@ The supervisor-run.sh script automatically handles WP agent recovery:
|
||||||
- Posts recovery comment with infra-flake context
|
- Posts recovery comment with infra-flake context
|
||||||
- Avoids duplicate restarts via 5-minute cooldown in history file
|
- Avoids duplicate restarts via 5-minute cooldown in history file
|
||||||
|
|
||||||
**P0 Memory crisis:**
|
|
||||||
|
|
||||||
**P0 Memory crisis:**
|
**P0 Memory crisis:**
|
||||||
# Kill stale one-shot claude processes (>3h old)
|
# Kill stale one-shot claude processes (>3h old)
|
||||||
pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true
|
pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true
|
||||||
|
|
|
||||||
|
|
@@ -256,7 +256,7 @@ echo "Status: $_wp_health_status"
|
||||||
# Check for gRPC errors in agent logs (last 20 minutes)
|
# Check for gRPC errors in agent logs (last 20 minutes)
|
||||||
_wp_grpc_errors=0
|
_wp_grpc_errors=0
|
||||||
if [ "$_wp_health_status" != "not_found" ] && [ -n "$_wp_health_status" ]; then
|
if [ "$_wp_health_status" != "not_found" ] && [ -n "$_wp_health_status" ]; then
|
||||||
_wp_grpc_errors=$(docker logs --since 20m "$_wp_container" 2>&1 2>/dev/null | grep -c 'grpc error' || echo "0")
|
_wp_grpc_errors=$(docker logs --since 20m "$_wp_container" 2>&1 | grep -c 'grpc error' || echo "0")
|
||||||
echo "gRPC errors (last 20m): $_wp_grpc_errors"
|
echo "gRPC errors (last 20m): $_wp_grpc_errors"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -175,7 +175,8 @@ _WP_HEALTH_CHECK_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health-check.md"
|
||||||
echo "$PREFLIGHT_OUTPUT" > "$_WP_HEALTH_CHECK_FILE"
|
echo "$PREFLIGHT_OUTPUT" > "$_WP_HEALTH_CHECK_FILE"
|
||||||
|
|
||||||
# Extract WP agent health status from preflight output
|
# Extract WP agent health status from preflight output
|
||||||
_wp_agent_healthy=$(grep "^WP Agent Health:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | grep -q "healthy" && echo "true" || echo "false")
|
# Note: match exact "healthy" not "UNHEALTHY" (substring issue)
|
||||||
|
_wp_agent_healthy=$(grep "^WP Agent Health: healthy$" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null && echo "true" || echo "false")
|
||||||
_wp_health_reason=$(grep "^Reason:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "")
|
_wp_health_reason=$(grep "^Reason:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "")
|
||||||
|
|
||||||
if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
|
if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
|
||||||
|
|
@@ -201,7 +202,7 @@ if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
|
||||||
# Restart the WP agent container
|
# Restart the WP agent container
|
||||||
if docker restart "$WP_AGENT_CONTAINER_NAME" >/dev/null 2>&1; then
|
if docker restart "$WP_AGENT_CONTAINER_NAME" >/dev/null 2>&1; then
|
||||||
_restart_time=$(date -u '+%Y-%m-%d %H:%M UTC')
|
_restart_time=$(date -u '+%Y-%m-%d %H:%M UTC')
|
||||||
log "Successfully restarted WP agent container: $_wp_agent_healthy"
|
log "Successfully restarted WP agent container: $WP_AGENT_CONTAINER_NAME"
|
||||||
|
|
||||||
# Update history file
|
# Update history file
|
||||||
echo "LAST_RESTART_TS=$_current_ts" > "$_WP_HEALTH_HISTORY_FILE"
|
echo "LAST_RESTART_TS=$_current_ts" > "$_WP_HEALTH_HISTORY_FILE"
|
||||||
|
|
@@ -306,7 +307,7 @@ EOF
|
||||||
-H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
|
-H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
"${FORGE_API}/issues/$_issue_num/comments" \
|
"${FORGE_API}/issues/$_issue_num/comments" \
|
||||||
-d "{\"body\":$_recovery_comment}" >/dev/null 2>&1 || true
|
-d "$(jq -n --arg body "$_recovery_comment" '{body: $body}')" >/dev/null 2>&1 || true
|
||||||
|
|
||||||
log "Recovered issue #$_issue_num - returned to pool"
|
log "Recovered issue #$_issue_num - returned to pool"
|
||||||
fi
|
fi
|
||||||
|
|
|
||||||
|
|
@@ -151,9 +151,9 @@ _kv_put_secret() {
|
||||||
-X POST \
|
-X POST \
|
||||||
-d "$payload" \
|
-d "$payload" \
|
||||||
-o "$tmpfile" \
|
-o "$tmpfile" \
|
||||||
"${VAULT_ADDR}/v1/kv/data/${path}")" || {
|
"${VAULT_ADDR}/v1/${VAULT_KV_MOUNT:-kv}/data/${path}")" || {
|
||||||
rm -f "$tmpfile"
|
rm -f "$tmpfile"
|
||||||
_err "Failed to write to Vault at kv/data/${path}: curl error"
|
_err "Failed to write to Vault at ${VAULT_KV_MOUNT:-kv}/data/${path}: curl error"
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
rm -f "$tmpfile"
|
rm -f "$tmpfile"
|
||||||
|
|
@@ -164,15 +164,15 @@ _kv_put_secret() {
|
||||||
return 0
|
return 0
|
||||||
;;
|
;;
|
||||||
404)
|
404)
|
||||||
_err "KV path not found: kv/data/${path}"
|
_err "KV path not found: ${VAULT_KV_MOUNT:-kv}/data/${path}"
|
||||||
return 1
|
return 1
|
||||||
;;
|
;;
|
||||||
403)
|
403)
|
||||||
_err "Permission denied writing to kv/data/${path}"
|
_err "Permission denied writing to ${VAULT_KV_MOUNT:-kv}/data/${path}"
|
||||||
return 1
|
return 1
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
_err "Failed to write to Vault at kv/data/${path}: HTTP $http_code"
|
_err "Failed to write to Vault at ${VAULT_KV_MOUNT:-kv}/data/${path}: HTTP $http_code"
|
||||||
return 1
|
return 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue