From 3b84474d6d711de7d993c75cf34902a53abcea3c Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 21:40:07 +0000 Subject: [PATCH 1/4] Tag Slack alerts with env/URL and fix Crashes summary count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So one Slack channel can serve all environments, include the server maturity and URL on every alert. Also fix the History summary's Crashes card: crashes aren't recorded as their own event type — they're scale_down rows with payload.kind="crashed", so the previous lookup for type="crash" always returned zero. --- workers/monitor/alerts.py | 6 ++++++ workers/monitor/storage.py | 22 ++++++++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/workers/monitor/alerts.py b/workers/monitor/alerts.py index 08b5c05..6ad874b 100644 --- a/workers/monitor/alerts.py +++ b/workers/monitor/alerts.py @@ -306,8 +306,14 @@ async def _dispatch_slack(event: Dict[str, Any]) -> None: "warning": ":warning:", "critical": ":rotating_light:", }.get(event.get("severity", "warning"), ":warning:") + # Environment context lets one Slack channel receive alerts from multiple + # deployments (dev / staging / production) without ambiguity about which + # one fired. + server_url = settings.server_url or "unknown" + maturity = settings.server_maturity or "unknown" text = ( f"{emoji} *Shepherd alert* `{event['rule']}` ({event['severity']})\n" + f"*Environment:* {maturity} | *URL:* <{server_url}|{server_url}>\n" f"{event['message']}" ) try: diff --git a/workers/monitor/storage.py b/workers/monitor/storage.py index a0b56b4..6077db5 100644 --- a/workers/monitor/storage.py +++ b/workers/monitor/storage.py @@ -395,22 +395,28 @@ async def query_summary(since: float, until: float) -> Dict[str, Any]: row = await cur.fetchone() summary["queries_started"] = int(row[0] or 0) - # Event counts. + # Event counts. Crashes are not their own row type -- the poller + # records them as scale_down events with ``payload.kind = "crashed"`` + # (a clean scale-down uses kind="scaled_down"), so we have to dig + # into the JSON payload to separate the two. cur = await conn.execute( - "SELECT type, COUNT(*) FROM monitor_events " + "SELECT type, COUNT(*) FILTER (WHERE payload->>'kind' = 'crashed'), " + "COUNT(*) FROM monitor_events " "WHERE ts >= to_timestamp(%s) AND ts < to_timestamp(%s) " "GROUP BY type", (since, until), ) - for etype, count in await cur.fetchall(): - if etype == "crash": - summary["crashes"] = int(count) - elif etype in ("scale_up", "scale_down"): + for etype, crashed_count, total_count in await cur.fetchall(): + if etype in ("scale_up", "scale_down"): summary["scale_events"] = summary.get("scale_events", 0) + int( - count + total_count ) + if etype == "scale_down": + summary["crashes"] = summary.get("crashes", 0) + int( + crashed_count or 0 + ) elif etype == "alert": - summary["alert_count"] = int(count) + summary["alert_count"] = int(total_count) # Peak backlog per stream. cur = await conn.execute( From eee5a560f55c40bbfe228700ba2475141cbf68ab Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 22:07:18 +0000 Subject: [PATCH 2/4] Persist hidden-series state across live chart updates The Live view's XLEN chart rebuilds its datasets on every snapshot, so toggling a series off via the legend bounced back on the next tick. Mirror the user's hidden choices in a label-keyed Set and reapply on each rebuild. --- workers/monitor/static/app.js | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/workers/monitor/static/app.js b/workers/monitor/static/app.js index c01a44c..0ae660a 100644 --- a/workers/monitor/static/app.js +++ b/workers/monitor/static/app.js @@ -22,6 +22,10 @@ let xlenChart, araChart; let xlenHistory = {}; // stream -> [{x, y}, ...] const HISTORY_POINTS = 60; // rolling window in the chart + // Tracks chart legend items the user clicked to hide. Datasets are rebuilt + // on every snapshot tick, which wipes Chart.js's internal visibility state, + // so we mirror it here (keyed by series label) and re-apply on each rebuild. + const xlenHiddenSeries = new Set(); function fmt(n) { if (n === null || n === undefined) return "-"; @@ -56,7 +60,18 @@ animation: false, parsing: false, plugins: { - legend: { labels: { color: "#8b949e", boxWidth: 10 } }, + legend: { + labels: { color: "#8b949e", boxWidth: 10 }, + onClick: (e, legendItem, legend) => { + Chart.defaults.plugins.legend.onClick(e, legendItem, legend); + const label = legendItem.text; + if (legend.chart.isDatasetVisible(legendItem.datasetIndex)) { + xlenHiddenSeries.delete(label); + } else { + xlenHiddenSeries.add(label); + } + }, + }, title: { display: true, text: "Queue depth (XLEN)", color: "#e6edf3" }, }, scales: { @@ -231,6 +246,7 @@ borderWidth: 1.5, pointRadius: 0, tension: 0.2, + hidden: xlenHiddenSeries.has(n), })); xlenChart.update("none"); } From 3cb2265e6c82d2de0cda92410fe43d7ef58db2a7 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 25 May 2026 01:30:00 +0000 Subject: [PATCH 3/4] Use AsyncClient in ARAX worker so heartbeats keep flowing ARAX was the only worker still using the synchronous httpx.Client. Its post() blocks the event loop for the full request (up to 270s), which stops the heartbeat task from refreshing its Redis key. Any ARAX response slower than the 15s heartbeat TTL caused the monitor to flag the worker as crashed and fire a false-positive alert. --- workers/arax/worker.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/workers/arax/worker.py b/workers/arax/worker.py index 44647bb..a6ad652 100644 --- a/workers/arax/worker.py +++ b/workers/arax/worker.py @@ -31,8 +31,13 @@ async def arax(task, logger: logging.Logger): logger.info(f"Get the message from db {message}") headers = {"Content-Type": "application/json"} - with httpx.Client(timeout=270) as client: - response = client.post(settings.arax_url, json=message, headers=headers) + # Must be AsyncClient: the sync httpx.Client blocks the event loop for + # the whole request, which starves the heartbeat task and makes the + # monitor flag this worker as crashed on any slow ARAX response. + async with httpx.AsyncClient(timeout=270) as client: + response = await client.post( + settings.arax_url, json=message, headers=headers + ) logger.info(f"Status Code from ARAX response: {response.status_code}") result = response.json() From 6cedb4d263bf1b345a5298c4c9e460b6398de62c Mon Sep 17 00:00:00 2001 From: Max Wang Date: Tue, 26 May 2026 16:03:22 -0400 Subject: [PATCH 4/4] Remove unnecessary comments --- workers/arax/worker.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/workers/arax/worker.py b/workers/arax/worker.py index a6ad652..39d110c 100644 --- a/workers/arax/worker.py +++ b/workers/arax/worker.py @@ -31,9 +31,6 @@ async def arax(task, logger: logging.Logger): logger.info(f"Get the message from db {message}") headers = {"Content-Type": "application/json"} - # Must be AsyncClient: the sync httpx.Client blocks the event loop for - # the whole request, which starves the heartbeat task and makes the - # monitor flag this worker as crashed on any slow ARAX response. async with httpx.AsyncClient(timeout=270) as client: response = await client.post( settings.arax_url, json=message, headers=headers