Compare commits

..

4 commits

Author SHA1 Message Date
Agent
c0697ab27b fix: incident: WP gRPC flake burned dev-qwen CI retry budget on #842 (2026-04-16) (#867)
All checks were successful
ci/woodpecker/push/ci Pipeline was successful
ci/woodpecker/pr/ci Pipeline was successful
2026-04-17 01:34:41 +00:00
Agent
04ead1fbdc fix: incident: WP gRPC flake burned dev-qwen CI retry budget on #842 (2026-04-16) (#867) 2026-04-17 01:34:41 +00:00
c3e58e88ed Merge pull request 'fix: tech-debt: tools/vault-import.sh uses hardcoded secret/ KV mount (#910)' (#932) from fix/issue-910 into main
All checks were successful
ci/woodpecker/push/ci Pipeline was successful
2026-04-17 01:31:10 +00:00
dev-qwen2
99d3cb4c8f fix: tech-debt: tools/vault-import.sh uses hardcoded secret/ KV mount (#910)
All checks were successful
ci/woodpecker/push/ci Pipeline was successful
ci/woodpecker/pr/ci Pipeline was successful
ci/woodpecker/pr/secret-scan Pipeline was successful
2026-04-17 01:18:03 +00:00
4 changed files with 10 additions and 11 deletions

View file

@ -113,8 +113,6 @@ The supervisor-run.sh script automatically handles WP agent recovery:
- Posts recovery comment with infra-flake context - Posts recovery comment with infra-flake context
- Avoids duplicate restarts via 5-minute cooldown in history file - Avoids duplicate restarts via 5-minute cooldown in history file
**P0 Memory crisis:**
**P0 Memory crisis:** **P0 Memory crisis:**
# Kill stale one-shot claude processes (>3h old) # Kill stale one-shot claude processes (>3h old)
pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true

View file

@ -256,7 +256,7 @@ echo "Status: $_wp_health_status"
# Check for gRPC errors in agent logs (last 20 minutes) # Check for gRPC errors in agent logs (last 20 minutes)
_wp_grpc_errors=0 _wp_grpc_errors=0
if [ "$_wp_health_status" != "not_found" ] && [ -n "$_wp_health_status" ]; then if [ "$_wp_health_status" != "not_found" ] && [ -n "$_wp_health_status" ]; then
_wp_grpc_errors=$(docker logs --since 20m "$_wp_container" 2>&1 2>/dev/null | grep -c 'grpc error' || echo "0") _wp_grpc_errors=$(docker logs --since 20m "$_wp_container" 2>&1 | grep -c 'grpc error' || echo "0")
echo "gRPC errors (last 20m): $_wp_grpc_errors" echo "gRPC errors (last 20m): $_wp_grpc_errors"
fi fi

View file

@ -175,7 +175,8 @@ _WP_HEALTH_CHECK_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health-check.md"
echo "$PREFLIGHT_OUTPUT" > "$_WP_HEALTH_CHECK_FILE" echo "$PREFLIGHT_OUTPUT" > "$_WP_HEALTH_CHECK_FILE"
# Extract WP agent health status from preflight output # Extract WP agent health status from preflight output
_wp_agent_healthy=$(grep "^WP Agent Health:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | grep -q "healthy" && echo "true" || echo "false") # Note: match exact "healthy" not "UNHEALTHY" (substring issue)
_wp_agent_healthy=$(grep "^WP Agent Health: healthy$" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null && echo "true" || echo "false")
_wp_health_reason=$(grep "^Reason:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "") _wp_health_reason=$(grep "^Reason:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "")
if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
@ -201,7 +202,7 @@ if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
# Restart the WP agent container # Restart the WP agent container
if docker restart "$WP_AGENT_CONTAINER_NAME" >/dev/null 2>&1; then if docker restart "$WP_AGENT_CONTAINER_NAME" >/dev/null 2>&1; then
_restart_time=$(date -u '+%Y-%m-%d %H:%M UTC') _restart_time=$(date -u '+%Y-%m-%d %H:%M UTC')
log "Successfully restarted WP agent container: $_wp_agent_healthy" log "Successfully restarted WP agent container: $WP_AGENT_CONTAINER_NAME"
# Update history file # Update history file
echo "LAST_RESTART_TS=$_current_ts" > "$_WP_HEALTH_HISTORY_FILE" echo "LAST_RESTART_TS=$_current_ts" > "$_WP_HEALTH_HISTORY_FILE"
@ -306,7 +307,7 @@ EOF
-H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \ -H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
"${FORGE_API}/issues/$_issue_num/comments" \ "${FORGE_API}/issues/$_issue_num/comments" \
-d "{\"body\":$_recovery_comment}" >/dev/null 2>&1 || true -d "$(jq -n --arg body "$_recovery_comment" '{body: $body}')" >/dev/null 2>&1 || true
log "Recovered issue #$_issue_num - returned to pool" log "Recovered issue #$_issue_num - returned to pool"
fi fi

View file

@ -151,9 +151,9 @@ _kv_put_secret() {
-X POST \ -X POST \
-d "$payload" \ -d "$payload" \
-o "$tmpfile" \ -o "$tmpfile" \
"${VAULT_ADDR}/v1/kv/data/${path}")" || { "${VAULT_ADDR}/v1/${VAULT_KV_MOUNT:-kv}/data/${path}")" || {
rm -f "$tmpfile" rm -f "$tmpfile"
_err "Failed to write to Vault at kv/data/${path}: curl error" _err "Failed to write to Vault at ${VAULT_KV_MOUNT:-kv}/data/${path}: curl error"
return 1 return 1
} }
rm -f "$tmpfile" rm -f "$tmpfile"
@ -164,15 +164,15 @@ _kv_put_secret() {
return 0 return 0
;; ;;
404) 404)
_err "KV path not found: kv/data/${path}" _err "KV path not found: ${VAULT_KV_MOUNT:-kv}/data/${path}"
return 1 return 1
;; ;;
403) 403)
_err "Permission denied writing to kv/data/${path}" _err "Permission denied writing to ${VAULT_KV_MOUNT:-kv}/data/${path}"
return 1 return 1
;; ;;
*) *)
_err "Failed to write to Vault at kv/data/${path}: HTTP $http_code" _err "Failed to write to Vault at ${VAULT_KV_MOUNT:-kv}/data/${path}: HTTP $http_code"
return 1 return 1
;; ;;
esac esac