From 567dc4bde0ac6df5c5ed1ea0057943d0755b86b4 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 17 Mar 2026 09:31:49 +0000 Subject: [PATCH] fix: address review findings for supervisor metrics (#24) - planner: filter CI and dev metrics by project name to prevent cross-project pollution - planner: replace fragile awk JSONL filter with jq select() - supervisor: add codeberg_count_paginated() helper; replace hardcoded limit=50 dev-metric API calls with paginated counts so projects with >50 issues report accurate blocked-ratio data - supervisor: add 24h age filter to CI metric SQL query so stale pipelines are not re-emitted with a fresh timestamp - supervisor: replace fragile awk key-order-dependent JSON filter in rotate_metrics() with jq select(); add safety guard to prevent overwriting file with empty result on parse failure - supervisor: move mkdir -p for metrics dir to startup (once) instead of every emit_metric() call - supervisor: guard _RAM_TOTAL_MB against empty value in bash arithmetic Co-Authored-By: Claude Sonnet 4.6 --- planner/planner-agent.sh | 14 ++++++------ supervisor/supervisor-poll.sh | 41 ++++++++++++++++++++++++++--------- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/planner/planner-agent.sh b/planner/planner-agent.sh index 1252c6a..38a8cb1 100755 --- a/planner/planner-agent.sh +++ b/planner/planner-agent.sh @@ -195,15 +195,15 @@ METRICS_FILE="${FACTORY_ROOT}/metrics/supervisor-metrics.jsonl" METRICS_SUMMARY="(no metrics data — supervisor has not yet written metrics)" if [ -f "$METRICS_FILE" ] && [ -s "$METRICS_FILE" ]; then _METRICS_CUTOFF=$(date -u -d '7 days ago' +%Y-%m-%dT%H:%M) - METRICS_SUMMARY=$(awk -F'"' -v cutoff="$_METRICS_CUTOFF" \ - 'NF >= 4 && $2 == "ts" && $4 >= cutoff' "$METRICS_FILE" 2>/dev/null | \ - jq -rs ' - ( [.[] | select(.type=="ci") | .duration_min] | if length>0 then add/length|round else null end ) as $ci_avg | - ( [.[] | select(.type=="ci") | select(.status=="success")] | length ) as $ci_ok | - ( [.[] | select(.type=="ci")] | length ) as $ci_n | + METRICS_SUMMARY=$(jq -c --arg cutoff "$_METRICS_CUTOFF" 'select(.ts >= $cutoff)' \ + "$METRICS_FILE" 2>/dev/null | \ + jq -rs --arg proj "${PROJECT_NAME:-}" ' + ( [.[] | select(.type=="ci" and .project==$proj) | .duration_min] | if length>0 then add/length|round else null end ) as $ci_avg | + ( [.[] | select(.type=="ci" and .project==$proj) | select(.status=="success")] | length ) as $ci_ok | + ( [.[] | select(.type=="ci" and .project==$proj)] | length ) as $ci_n | ( [.[] | select(.type=="infra") | .ram_used_pct] | if length>0 then add/length|round else null end ) as $ram_avg | ( [.[] | select(.type=="infra") | .disk_used_pct] | if length>0 then add/length|round else null end ) as $disk_avg | - ( [.[] | select(.type=="dev")] | last ) as $dev_last | + ( [.[] | select(.type=="dev" and .project==$proj)] | last ) as $dev_last | "CI (\($ci_n) pipelines): avg \(if $ci_avg then "\($ci_avg)min" else "n/a" end), success rate \(if $ci_n > 0 then "\($ci_ok * 100 / $ci_n | round)%" else "n/a" end)\n" + "Infra: avg RAM \(if $ram_avg then "\($ram_avg)%" else "n/a" end) used, avg disk \(if $disk_avg then "\($disk_avg)%" else "n/a" end) used\n" + "Dev (latest): \(if $dev_last then "\($dev_last.issues_in_backlog) in backlog, \($dev_last.issues_blocked) blocked (\(if $dev_last.issues_in_backlog > 0 then $dev_last.issues_blocked * 100 / $dev_last.issues_in_backlog | round else 0 end)% blocked), \($dev_last.pr_open) open PRs" else "n/a" end) diff --git a/supervisor/supervisor-poll.sh b/supervisor/supervisor-poll.sh index e6484cc..6e9169c 100755 --- a/supervisor/supervisor-poll.sh +++ b/supervisor/supervisor-poll.sh @@ -23,17 +23,37 @@ PROJECTS_DIR="${FACTORY_ROOT}/projects" METRICS_FILE="${FACTORY_ROOT}/metrics/supervisor-metrics.jsonl" emit_metric() { - mkdir -p "$(dirname "$METRICS_FILE")" printf '%s\n' "$1" >> "$METRICS_FILE" } +# Count all matching items from a paginated Codeberg API endpoint. +# Usage: codeberg_count_paginated "/issues?state=open&labels=backlog&type=issues" +# Returns total count across all pages (max 20 pages = 1000 items). +codeberg_count_paginated() { + local endpoint="$1" total=0 page=1 count + while true; do + count=$(codeberg_api GET "${endpoint}&limit=50&page=${page}" 2>/dev/null | jq 'length' 2>/dev/null || echo 0) + total=$((total + ${count:-0})) + [ "${count:-0}" -lt 50 ] && break + page=$((page + 1)) + [ "$page" -gt 20 ] && break + done + echo "$total" +} + rotate_metrics() { [ -f "$METRICS_FILE" ] || return 0 local cutoff tmpfile cutoff=$(date -u -d '30 days ago' +%Y-%m-%dT%H:%M) tmpfile="${METRICS_FILE}.tmp" - awk -F'"' -v cutoff="$cutoff" 'NF >= 4 && $2 == "ts" && $4 >= cutoff' \ - "$METRICS_FILE" > "$tmpfile" && mv "$tmpfile" "$METRICS_FILE" || rm -f "$tmpfile" + jq -c --arg cutoff "$cutoff" 'select(.ts >= $cutoff)' \ + "$METRICS_FILE" > "$tmpfile" 2>/dev/null + # Only replace if jq produced output, or the source is already empty + if [ -s "$tmpfile" ] || [ ! -s "$METRICS_FILE" ]; then + mv "$tmpfile" "$METRICS_FILE" + else + rm -f "$tmpfile" + fi } # Prevent overlapping runs @@ -46,6 +66,7 @@ if [ -f "$LOCKFILE" ]; then fi echo $$ > "$LOCKFILE" trap 'rm -f "$LOCKFILE" "$STATUSFILE"' EXIT +mkdir -p "$(dirname "$METRICS_FILE")" rotate_metrics flog() { @@ -164,7 +185,7 @@ fi # Emit infra metric _RAM_TOTAL_MB=$(free -m | awk '/Mem:/{print $2}') -_RAM_USED_PCT=$(( _RAM_TOTAL_MB > 0 ? (_RAM_TOTAL_MB - AVAIL_MB) * 100 / _RAM_TOTAL_MB : 0 )) +_RAM_USED_PCT=$(( ${_RAM_TOTAL_MB:-0} > 0 ? (${_RAM_TOTAL_MB:-0} - ${AVAIL_MB:-0}) * 100 / ${_RAM_TOTAL_MB:-1} : 0 )) emit_metric "$(jq -nc \ --arg ts "$(date -u +%Y-%m-%dT%H:%MZ)" \ --argjson ram "${_RAM_USED_PCT:-0}" \ @@ -224,8 +245,8 @@ check_project() { PENDING_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='pending' AND EXTRACT(EPOCH FROM now() - to_timestamp(created)) > 1800;" 2>/dev/null | xargs || true) [ "${PENDING_CI:-0}" -gt 0 ] && p2 "${proj_name}: CI: ${PENDING_CI} pipeline(s) pending >30min" - # Emit CI metric (last completed pipeline) - _CI_ROW=$(wpdb -A -F ',' -c "SELECT id, COALESCE(ROUND(EXTRACT(EPOCH FROM (to_timestamp(finished) - to_timestamp(started)))/60)::int, 0), status FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status IN ('success','failure','error') AND finished > 0 ORDER BY id DESC LIMIT 1;" 2>/dev/null | grep -E '^[0-9]' | head -1 || true) + # Emit CI metric (last completed pipeline within 24h — skip if project has no recent CI) + _CI_ROW=$(wpdb -A -F ',' -c "SELECT id, COALESCE(ROUND(EXTRACT(EPOCH FROM (to_timestamp(finished) - to_timestamp(started)))/60)::int, 0), status FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status IN ('success','failure','error') AND finished > 0 AND to_timestamp(finished) > now() - interval '24 hours' ORDER BY id DESC LIMIT 1;" 2>/dev/null | grep -E '^[0-9]' | head -1 || true) if [ -n "$_CI_ROW" ]; then _CI_ID=$(echo "$_CI_ROW" | cut -d',' -f1 | tr -d ' ') _CI_DUR=$(echo "$_CI_ROW" | cut -d',' -f2 | tr -d ' ') @@ -467,10 +488,10 @@ check_project() { unset DEPS_OF BACKLOG_NUMS NODE_COLOR SEEN_CYCLES DEP_CACHE fi - # Emit dev metric - _BACKLOG_COUNT=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=50" 2>/dev/null | jq 'length' 2>/dev/null || echo 0) - _BLOCKED_COUNT=$(codeberg_api GET "/issues?state=open&labels=blocked&type=issues&limit=50" 2>/dev/null | jq 'length' 2>/dev/null || echo 0) - _PR_COUNT=$(codeberg_api GET "/pulls?state=open&limit=50" 2>/dev/null | jq 'length' 2>/dev/null || echo 0) + # Emit dev metric (paginated to avoid silent cap at 50) + _BACKLOG_COUNT=$(codeberg_count_paginated "/issues?state=open&labels=backlog&type=issues") + _BLOCKED_COUNT=$(codeberg_count_paginated "/issues?state=open&labels=blocked&type=issues") + _PR_COUNT=$(codeberg_count_paginated "/pulls?state=open") emit_metric "$(jq -nc \ --arg ts "$(date -u +%Y-%m-%dT%H:%MZ)" \ --arg proj "$proj_name" \