diff --git a/planner/planner-agent.sh b/planner/planner-agent.sh
index 24acdd9..1252c6a 100755
--- a/planner/planner-agent.sh
+++ b/planner/planner-agent.sh
@@ -190,6 +190,30 @@ OPEN_SUMMARY=$(echo "$OPEN_ISSUES" | jq -r '.[] | "#\(.number) [\(.labels | map(
 # Fetch vision-labeled issues specifically
 VISION_ISSUES=$(echo "$OPEN_ISSUES" | jq -r '.[] | select(.labels | map(.name) | index("vision")) | "#\(.number) \(.title)\n\(.body)"' 2>/dev/null || true)
 
+# Read supervisor metrics for trend analysis (last 7 days)
+# CI samples are de-duplicated by (project, pipeline) because the supervisor
+# re-emits the most recent finished pipeline on every poll.
+METRICS_FILE="${FACTORY_ROOT}/metrics/supervisor-metrics.jsonl"
+METRICS_SUMMARY="(no metrics data — supervisor has not yet written metrics)"
+if [ -f "$METRICS_FILE" ] && [ -s "$METRICS_FILE" ]; then
+  _METRICS_CUTOFF=$(date -u -d '7 days ago' +%Y-%m-%dT%H:%M)
+  METRICS_SUMMARY=$(awk -F'"' -v cutoff="$_METRICS_CUTOFF" \
+    'NF >= 4 && $2 == "ts" && $4 >= cutoff' "$METRICS_FILE" 2>/dev/null | \
+    jq -rs '
+      ( [.[] | select(.type=="ci")] | unique_by([.project, .pipeline]) ) as $ci |
+      ( [$ci[] | .duration_min] | if length>0 then add/length|round else null end ) as $ci_avg |
+      ( [$ci[] | select(.status=="success")] | length ) as $ci_ok |
+      ( $ci | length ) as $ci_n |
+      ( [.[] | select(.type=="infra") | .ram_used_pct] | if length>0 then add/length|round else null end ) as $ram_avg |
+      ( [.[] | select(.type=="infra") | .disk_used_pct] | if length>0 then add/length|round else null end ) as $disk_avg |
+      ( [.[] | select(.type=="dev")] | last ) as $dev_last |
+      "CI (\($ci_n) pipelines): avg \(if $ci_avg then "\($ci_avg)min" else "n/a" end), success rate \(if $ci_n > 0 then "\($ci_ok * 100 / $ci_n | round)%" else "n/a" end)\n" +
+      "Infra: avg RAM \(if $ram_avg then "\($ram_avg)%" else "n/a" end) used, avg disk \(if $disk_avg then "\($disk_avg)%" else "n/a" end) used\n" +
+      "Dev (latest): \(if $dev_last then "\($dev_last.issues_in_backlog) in backlog, \($dev_last.issues_blocked) blocked (\(if $dev_last.issues_in_backlog > 0 then $dev_last.issues_blocked * 100 / $dev_last.issues_in_backlog | round else 0 end)% blocked), \($dev_last.pr_open) open PRs" else "n/a" end)
+    ' 2>/dev/null) || METRICS_SUMMARY="(metrics parse error)"
+  log "Metrics: ${METRICS_SUMMARY:0:120}"
+fi
+
 PHASE2_PROMPT="You are the planner for ${CODEBERG_REPO}. Your job: find gaps between the project vision and current reality.
 
 ## VISION.md (human-maintained goals)
@@ -204,6 +228,9 @@ ${VISION_ISSUES:-"(none)"}
 ## All open issues
 ${OPEN_SUMMARY}
 
+## Operational metrics (last 7 days from supervisor)
+${METRICS_SUMMARY}
+
 ## Task
 Identify gaps — things implied by VISION.md that are neither reflected in the project state nor covered by an existing open issue.
 
@@ -218,6 +245,7 @@ For each gap, output a JSON object (one per line, no array wrapper):
 - Each title should be a plain, action-oriented sentence
 - Each body should explain: what's missing, why it matters for the vision, rough approach
 - Reference blocking issues by number in depends array
+- When metrics indicate a systemic problem conflicting with VISION.md (slow CI, high blocked ratio, disk pressure), create an optimization issue even if not explicitly in VISION.md
 
 If there are no gaps, output exactly: NO_GAPS
 
diff --git a/supervisor/supervisor-poll.sh b/supervisor/supervisor-poll.sh
index aae8c3b..e6484cc 100755
--- a/supervisor/supervisor-poll.sh
+++ b/supervisor/supervisor-poll.sh
@@ -20,6 +20,33 @@ LOCKFILE="/tmp/supervisor-poll.lock"
 PROMPT_FILE="${FACTORY_ROOT}/supervisor/PROMPT.md"
 PROJECTS_DIR="${FACTORY_ROOT}/projects"
 
+METRICS_FILE="${FACTORY_ROOT}/metrics/supervisor-metrics.jsonl"
+
+# Append one JSON metric record ($1) to METRICS_FILE, creating its directory.
+# Empty payloads (e.g. when the jq builder failed and printed nothing) are
+# skipped so the JSONL file never accumulates blank lines.
+emit_metric() {
+  [ -n "$1" ] || return 0
+  mkdir -p "$(dirname "$METRICS_FILE")"
+  printf '%s\n' "$1" >> "$METRICS_FILE"
+}
+
+# Keep only records whose first quoted key is "ts" and whose timestamp is within
+# the last 30 days. The filtered copy replaces METRICS_FILE only when both awk
+# and mv succeed; otherwise the temp file is removed and the original kept.
+rotate_metrics() {
+  [ -f "$METRICS_FILE" ] || return 0
+  local cutoff tmpfile
+  cutoff=$(date -u -d '30 days ago' +%Y-%m-%dT%H:%M)
+  tmpfile="${METRICS_FILE}.tmp"
+  if awk -F'"' -v cutoff="$cutoff" 'NF >= 4 && $2 == "ts" && $4 >= cutoff' \
+    "$METRICS_FILE" > "$tmpfile"; then
+    mv "$tmpfile" "$METRICS_FILE" || rm -f "$tmpfile"
+  else
+    rm -f "$tmpfile"
+  fi
+}
+
 # Prevent overlapping runs
 if [ -f "$LOCKFILE" ]; then
   LOCK_PID=$(cat "$LOCKFILE" 2>/dev/null)
@@ -30,6 +57,7 @@ if [ -f "$LOCKFILE" ]; then
 fi
 echo $$ > "$LOCKFILE"
 trap 'rm -f "$LOCKFILE" "$STATUSFILE"' EXIT
+rotate_metrics
 
 flog() {
   printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" >> "$LOGFILE"
@@ -145,6 +173,16 @@ if [ "${DISK_PERCENT:-0}" -gt 80 ]; then
   fi
 fi
 
+# Emit infra metric
+_RAM_TOTAL_MB=$(free -m | awk '/Mem:/{print $2}')
+_RAM_USED_PCT=$(( _RAM_TOTAL_MB > 0 ? (_RAM_TOTAL_MB - AVAIL_MB) * 100 / _RAM_TOTAL_MB : 0 ))
+emit_metric "$(jq -nc \
+  --arg ts "$(date -u +%Y-%m-%dT%H:%MZ)" \
+  --argjson ram "${_RAM_USED_PCT:-0}" \
+  --argjson disk "${DISK_PERCENT:-0}" \
+  --argjson swap "${SWAP_USED_MB:-0}" \
+  '{ts:$ts,type:"infra",ram_used_pct:$ram,disk_used_pct:$disk,swap_mb:$swap}' 2>/dev/null)" 2>/dev/null || true
+
 # =============================================================================
 # P4-INFRA: HOUSEKEEPING — stale processes, log rotation (project-agnostic)
 # =============================================================================
@@ -197,6 +235,21 @@ check_project() {
   PENDING_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='pending' AND EXTRACT(EPOCH FROM now() - to_timestamp(created)) > 1800;" 2>/dev/null | xargs || true)
   [ "${PENDING_CI:-0}" -gt 0 ] && p2 "${proj_name}: CI: ${PENDING_CI} pipeline(s) pending >30min"
 
+  # Emit CI metric (last completed pipeline)
+  _CI_ROW=$(wpdb -A -F ',' -c "SELECT id, COALESCE(ROUND(EXTRACT(EPOCH FROM (to_timestamp(finished) - to_timestamp(started)))/60)::int, 0), status FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status IN ('success','failure','error') AND finished > 0 ORDER BY id DESC LIMIT 1;" 2>/dev/null | grep -E '^[0-9]' | head -1 || true)
+  if [ -n "$_CI_ROW" ]; then
+    _CI_ID=$(echo "$_CI_ROW" | cut -d',' -f1 | tr -d ' ')
+    _CI_DUR=$(echo "$_CI_ROW" | cut -d',' -f2 | tr -d ' ')
+    _CI_STAT=$(echo "$_CI_ROW" | cut -d',' -f3 | tr -d ' ')
+    emit_metric "$(jq -nc \
+      --arg ts "$(date -u +%Y-%m-%dT%H:%MZ)" \
+      --arg proj "$proj_name" \
+      --argjson pipeline "${_CI_ID:-0}" \
+      --argjson duration "${_CI_DUR:-0}" \
+      --arg status "${_CI_STAT:-unknown}" \
+      '{ts:$ts,type:"ci",project:$proj,pipeline:$pipeline,duration_min:$duration,status:$status}' 2>/dev/null)" 2>/dev/null || true
+  fi
+
   # Dev-agent health (only if monitoring enabled)
   if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then
     DEV_LOCK="/tmp/dev-agent.lock"
@@ -425,6 +478,19 @@ check_project() {
     unset DEPS_OF BACKLOG_NUMS NODE_COLOR SEEN_CYCLES DEP_CACHE
   fi
 
+  # Emit dev metric
+  # Only JSON arrays are counted; any other API response (e.g. an error object) yields 0
+  _BACKLOG_COUNT=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=50" 2>/dev/null | jq 'if type == "array" then length else 0 end' 2>/dev/null || echo 0)
+  _BLOCKED_COUNT=$(codeberg_api GET "/issues?state=open&labels=blocked&type=issues&limit=50" 2>/dev/null | jq 'if type == "array" then length else 0 end' 2>/dev/null || echo 0)
+  _PR_COUNT=$(codeberg_api GET "/pulls?state=open&limit=50" 2>/dev/null | jq 'if type == "array" then length else 0 end' 2>/dev/null || echo 0)
+  emit_metric "$(jq -nc \
+    --arg ts "$(date -u +%Y-%m-%dT%H:%MZ)" \
+    --arg proj "$proj_name" \
+    --argjson backlog "${_BACKLOG_COUNT:-0}" \
+    --argjson blocked "${_BLOCKED_COUNT:-0}" \
+    --argjson prs "${_PR_COUNT:-0}" \
+    '{ts:$ts,type:"dev",project:$proj,issues_in_backlog:$backlog,issues_blocked:$blocked,pr_open:$prs}' 2>/dev/null)" 2>/dev/null || true
+
   # ===========================================================================
   # P4-PROJECT: Clean stale worktrees for this project
   # ===========================================================================