#!/usr/bin/env bash
# =============================================================================
# collect-engagement.sh — Parse Caddy access logs into engagement evidence
#
# Reads Caddy's structured JSON access log, extracts visitor engagement data
# for the last 24 hours, and writes a dated JSON report to evidence/engagement/.
#
# The planner consumes these reports to close the build→ship→learn loop:
# an addressable (disinto.ai) becomes observable when engagement data flows back.
#
# Usage:
#   bash site/collect-engagement.sh
#
# Cron: 55 23 * * * cd /home/debian/dark-factory && bash site/collect-engagement.sh
# =============================================================================

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"

# shellcheck source=../lib/env.sh
source "$FACTORY_ROOT/lib/env.sh"

LOGFILE="${DISINTO_LOG_DIR}/site/collect-engagement.log"

# Ensure the log directory exists before the first log() call: under set -e,
# an append-redirect into a missing directory would abort the whole script.
mkdir -p "$(dirname "$LOGFILE")"

# log MESSAGE... — append a UTC-timestamped line to $LOGFILE.
log() {
  printf '[%s] collect-engagement: %s\n' \
    "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" >> "$LOGFILE"
}

# ── Configuration ────────────────────────────────────────────────────────────

# Caddy structured access log (JSON lines)
CADDY_LOG="${CADDY_ACCESS_LOG:-/var/log/caddy/access.log}"

# Evidence output directory (committed to ops repo)
EVIDENCE_DIR="${OPS_REPO_ROOT}/evidence/engagement"

# Report date — defaults to today
REPORT_DATE=$(date -u +%Y-%m-%d)

# Cutoff: only process entries from the last 24 hours.
# GNU date first, then BSD date; fall back to 0 (process everything) so the
# script still produces a report on platforms with neither syntax.
CUTOFF_TS=$(date -u -d '24 hours ago' +%s 2>/dev/null \
  || date -u -v-24H +%s 2>/dev/null \
  || echo 0)

# ── Preflight checks ────────────────────────────────────────────────────────

if [ ! -f "$CADDY_LOG" ]; then
  log "ERROR: Caddy access log not found at ${CADDY_LOG}"
  echo "ERROR: Caddy access log not found at ${CADDY_LOG}" >&2
  echo "Set CADDY_ACCESS_LOG to the correct path." >&2
  exit 1
fi

if ! command -v jq &>/dev/null; then
  log "ERROR: jq is required but not installed"
  # Report on stderr too, consistent with the other fatal branches.
  echo "ERROR: jq is required but not installed" >&2
  exit 1
fi

mkdir -p "$EVIDENCE_DIR"

# Verify input is Caddy JSON format (not Combined Log Format or other).
# grep -m1 '.' grabs the first non-empty line; || true keeps set -e happy
# when the file is entirely empty.
first_line=$(grep -m1 '.' "$CADDY_LOG" || true)
if [ -z "$first_line" ]; then
  log "WARN: Caddy access log is empty at ${CADDY_LOG}"
  echo "WARN: Caddy access log is empty — nothing to parse." >&2
  exit 0
fi
if ! printf '%s\n' "$first_line" | jq empty 2>/dev/null; then
  preview="${first_line:0:200}"
  log "ERROR: Input file is not Caddy JSON format (expected structured JSON access log). Got: ${preview}"
  echo "ERROR: Input file is not Caddy JSON format (expected structured JSON access log)." >&2
  echo "Got: ${preview}" >&2
  exit 1
fi

# ── Parse access log ────────────────────────────────────────────────────────

log "Parsing ${CADDY_LOG} for entries since $(date -u -d "@${CUTOFF_TS}" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo "${CUTOFF_TS}")"

# Extract relevant fields from Caddy JSON log lines.
# Caddy v2 structured log format:
#   ts (float epoch), request.uri, request.remote_ip, request.headers.Referer,
#   request.headers.User-Agent, status, size, duration
#
# Filter to last 24h, exclude assets/bots, produce a clean JSONL stream.
PARSED=$(jq -Rc --argjson cutoff "$CUTOFF_TS" '
  try fromjson
  | select(.ts >= $cutoff)
  | select(.request.uri != null)
  | {
      ts: .ts,
      ip: (.request.remote_ip // .request.remote_addr // "unknown" | split(":")[0]),
      uri: .request.uri,
      status: .status,
      size: .size,
      duration: .duration,
      referer: (.request.headers.Referer[0] // .request.headers.referer[0] // "direct"),
      ua: (.request.headers["User-Agent"][0] // .request.headers["user-agent"][0] // "unknown")
    }
' "$CADDY_LOG" 2>/dev/null || echo "")

if [ -z "$PARSED" ]; then
  log "No entries found in the last 24 hours"
  # Still write a report so the planner sees an explicit zero, not a gap.
  jq -nc \
    --arg date "$REPORT_DATE" \
    --arg source "$CADDY_LOG" \
    '{
      date: $date,
      source: $source,
      period_hours: 24,
      total_requests: 0,
      unique_visitors: 0,
      page_views: 0,
      referred_visitors: 0,
      top_pages: [],
      top_referrers: [],
      response_time: { p50_seconds: 0, p95_seconds: 0, p99_seconds: 0 },
      note: "no entries in period"
    }' > "${EVIDENCE_DIR}/${REPORT_DATE}.json"
  log "Empty report written to ${EVIDENCE_DIR}/${REPORT_DATE}.json"
  exit 0
fi

# ── Compute engagement metrics ──────────────────────────────────────────────

# Filter out static assets and known bots for page-view metrics
PAGES=$(printf '%s\n' "$PARSED" | jq -c '
  select(
    (.uri | test("\\.(css|js|png|jpg|jpeg|webp|ico|svg|woff2?|ttf|map)$") | not)
    and (.ua | test("bot|crawler|spider|slurp|Googlebot|Bingbot|YandexBot"; "i") | not)
    and (.status >= 200 and .status < 400)
  )
')

TOTAL_REQUESTS=$(printf '%s\n' "$PARSED" | wc -l | tr -d ' ')
# grep -c . counts non-empty lines and prints 0 (exit 1) on no match;
# || true keeps set -e from aborting in that case.
PAGE_VIEWS=$(printf '%s\n' "$PAGES" | grep -c . || true)
PAGE_VIEWS="${PAGE_VIEWS:-0}"
UNIQUE_VISITORS=$(printf '%s\n' "$PAGES" | jq -r '.ip' | sort -u | wc -l | tr -d ' ')

# Top pages by hit count. Aggregated entirely in jq: the previous awk-based
# "{\"path\":\"%s\"...}" formatter emitted invalid JSON whenever a URI
# contained a quote or backslash (silently collapsing the list to [] via the
# fallback) and truncated URIs at whitespace.
TOP_PAGES=$(printf '%s\n' "$PAGES" | jq -sc '
  group_by(.uri)
  | map({path: .[0].uri, views: length})
  | sort_by(-.views)
  | .[:10]
' 2>/dev/null || echo '[]')

# Top referrers (exclude direct/self), same jq aggregation for JSON safety.
TOP_REFERRERS=$(printf '%s\n' "$PAGES" | jq -sc '
  map(select(
    .referer != "direct" and .referer != "-"
    and (.referer | test("disinto\\.ai") | not)
  ))
  | group_by(.referer)
  | map({source: .[0].referer, visits: length})
  | sort_by(-.visits)
  | .[:10]
' 2>/dev/null || echo '[]')

# Unique visitors who came from external referrers
REFERRED_VISITORS=$(printf '%s\n' "$PAGES" | jq -r 'select(.referer != "direct" and .referer != "-" and (.referer | test("disinto\\.ai") | not)) | .ip' \
  | sort -u | wc -l | tr -d ' ')

# Response time stats (p50, p95, p99). Caddy reports duration as seconds
# (float), matching the p*_seconds keys in the report.
RESPONSE_TIMES=$(printf '%s\n' "$PAGES" | jq -r '.duration // 0' | sort -n)
# Count non-empty lines: wc -l would report 1 for an empty $RESPONSE_TIMES
# because printf always emits a trailing newline.
RT_COUNT=$(printf '%s\n' "$RESPONSE_TIMES" | grep -c . || true)
RT_COUNT="${RT_COUNT:-0}"
if [ "$RT_COUNT" -gt 0 ]; then
  # Ceiling of rank = count * percentile / 100, 1-based for sed.
  P50_IDX=$(( (RT_COUNT * 50 + 99) / 100 ))
  P95_IDX=$(( (RT_COUNT * 95 + 99) / 100 ))
  P99_IDX=$(( (RT_COUNT * 99 + 99) / 100 ))
  P50=$(printf '%s\n' "$RESPONSE_TIMES" | sed -n "${P50_IDX}p")
  P95=$(printf '%s\n' "$RESPONSE_TIMES" | sed -n "${P95_IDX}p")
  P99=$(printf '%s\n' "$RESPONSE_TIMES" | sed -n "${P99_IDX}p")
else
  P50=0; P95=0; P99=0
fi

# ── Write evidence ──────────────────────────────────────────────────────────

OUTPUT="${EVIDENCE_DIR}/${REPORT_DATE}.json"

jq -nc \
  --arg date "$REPORT_DATE" \
  --arg source "$CADDY_LOG" \
  --argjson total_requests "$TOTAL_REQUESTS" \
  --argjson page_views "$PAGE_VIEWS" \
  --argjson unique_visitors "$UNIQUE_VISITORS" \
  --argjson referred_visitors "$REFERRED_VISITORS" \
  --argjson top_pages "$TOP_PAGES" \
  --argjson top_referrers "$TOP_REFERRERS" \
  --argjson p50 "${P50:-0}" \
  --argjson p95 "${P95:-0}" \
  --argjson p99 "${P99:-0}" \
  '{
    date: $date,
    source: $source,
    period_hours: 24,
    total_requests: $total_requests,
    page_views: $page_views,
    unique_visitors: $unique_visitors,
    referred_visitors: $referred_visitors,
    top_pages: $top_pages,
    top_referrers: $top_referrers,
    response_time: {
      p50_seconds: $p50,
      p95_seconds: $p95,
      p99_seconds: $p99
    }
  }' > "$OUTPUT"

log "Engagement report written to ${OUTPUT}: ${UNIQUE_VISITORS} visitors, ${PAGE_VIEWS} page views"
echo "Engagement report: ${UNIQUE_VISITORS} unique visitors, ${PAGE_VIEWS} page views → ${OUTPUT}"

# ── Commit evidence to ops repo via Forgejo API ─────────────────────────────

#######################################
# Push the evidence file into the ops repo through the Forgejo contents API.
# Globals:   OPS_REPO_ROOT, FORGE_TOKEN, FORGE_URL, FORGE_OPS_REPO,
#            OPS_FORGE_OWNER, OPS_FORGE_REPO, FORGE_REPO, PROJECT_NAME (read)
# Arguments: $1 - path to the local evidence JSON file
# Returns:   0 on success or intentional skip, 1 on API failure
#######################################
commit_evidence_via_forgejo() {
  local evidence_file="$1"
  local report_date
  report_date=$(basename "$evidence_file" .json)
  local file_path="evidence/engagement/${report_date}.json"

  # Check if ops repo is available
  if [ -z "${OPS_REPO_ROOT:-}" ] || [ ! -d "${OPS_REPO_ROOT}/.git" ]; then
    log "SKIP: OPS_REPO_ROOT not set or not a git repo — evidence file not committed"
    return 0
  fi

  # Check if Forgejo credentials are available
  if [ -z "${FORGE_TOKEN:-}" ] || [ -z "${FORGE_URL:-}" ] || [ -z "${FORGE_OPS_REPO:-}" ]; then
    log "SKIP: Forgejo credentials not available (FORGE_TOKEN/FORGE_URL/FORGE_OPS_REPO) — evidence file not committed"
    return 0
  fi

  # Read and base64-encode the file content. GNU base64 wraps output at 76
  # columns by default; strip the newlines with tr (portable, unlike the
  # GNU-only -w0 flag) so the API receives a single-line base64 string.
  local content
  content=$(base64 < "$evidence_file" | tr -d '\n')

  # NOTE(review): ${FORGE_REPO%%/*} assumes FORGE_REPO is set (set -u) and is
  # "owner/repo" shaped — confirm against env.sh.
  local ops_owner="${OPS_FORGE_OWNER:-${FORGE_REPO%%/*}}"
  local ops_repo="${OPS_FORGE_REPO:-${PROJECT_NAME:-disinto}-ops}"

  # Check if file already exists in the ops repo (need its sha to update)
  local existing
  existing=$(curl -sf \
    -H "Authorization: token ${FORGE_TOKEN}" \
    "${FORGE_URL}/api/v1/repos/${ops_owner}/${ops_repo}/contents/${file_path}" \
    2>/dev/null || echo "")

  if [ -n "$existing" ] && printf '%s' "$existing" | jq -e '.sha' >/dev/null 2>&1; then
    # Update existing file (PUT requires the current blob sha)
    local sha
    sha=$(printf '%s' "$existing" | jq -r '.sha')
    if curl -sf -X PUT \
      -H "Authorization: token ${FORGE_TOKEN}" \
      -H "Content-Type: application/json" \
      "${FORGE_URL}/api/v1/repos/${ops_owner}/${ops_repo}/contents/${file_path}" \
      -d "$(jq -nc --arg content "$content" --arg sha "$sha" --arg msg "evidence: engagement ${report_date}" \
        '{message: $msg, content: $content, sha: $sha}')" >/dev/null 2>&1; then
      log "Updated evidence file in ops repo: ${file_path}"
      return 0
    else
      log "ERROR: failed to update evidence file in ops repo"
      return 1
    fi
  else
    # Create new file
    if curl -sf -X POST \
      -H "Authorization: token ${FORGE_TOKEN}" \
      -H "Content-Type: application/json" \
      "${FORGE_URL}/api/v1/repos/${ops_owner}/${ops_repo}/contents/${file_path}" \
      -d "$(jq -nc --arg content "$content" --arg msg "evidence: engagement ${report_date}" \
        '{message: $msg, content: $content}')" >/dev/null 2>&1; then
      log "Created evidence file in ops repo: ${file_path}"
      return 0
    else
      log "ERROR: failed to create evidence file in ops repo"
      return 1
    fi
  fi
}

# Attempt to commit evidence (non-fatal — data collection succeeded even if commit fails)
commit_evidence_via_forgejo "$OUTPUT" \
  || log "WARNING: evidence commit skipped or failed — file exists locally at ${OUTPUT}"