From 574733b0e11f755fcb937474600f66d26f685fda Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 23 Apr 2026 19:15:56 +0100 Subject: [PATCH 1/2] feat(webapp): add per-worker Node.js heap metrics Extends the existing nodejs.* OTel gauges in tracer.server.ts with direct V8 heap + process memory readings via v8.getHeapStatistics() and process.memoryUsage(): - nodejs.memory.heap.used - V8 heap currently in use (point-in-time; may include not-yet-collected garbage) - nodejs.memory.heap.total - V8 heap reserved - nodejs.memory.heap.limit - configured max-old-space-size - nodejs.memory.external - C++ objects bound to JS (Buffer, etc.) - nodejs.memory.array_buffers - ArrayBuffer/SharedArrayBuffer memory - nodejs.memory.rss - resident set size @opentelemetry/host-metrics already publishes process.memory.usage (RSS), but RSS overstates V8 heap by the external + native footprint. Without a direct heap metric it's impossible to size NODE_MAX_OLD_SPACE_SIZE against actual V8 usage. These gauges land in the same trigger.dev scope and carry the same per-worker tags (process.executable.name, service.instance.id) so they're queryable alongside the existing event-loop + handle metrics on a per-cluster-worker basis. --- .server-changes/nodejs-heap-metrics.md | 6 +++ apps/webapp/app/v3/tracer.server.ts | 60 ++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 .server-changes/nodejs-heap-metrics.md diff --git a/.server-changes/nodejs-heap-metrics.md b/.server-changes/nodejs-heap-metrics.md new file mode 100644 index 0000000000..bb82fcca99 --- /dev/null +++ b/.server-changes/nodejs-heap-metrics.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Add per-worker Node.js heap metrics to the OTel meter — `nodejs.memory.heap.used`, `nodejs.memory.heap.total`, `nodejs.memory.heap.limit`, `nodejs.memory.external`, `nodejs.memory.array_buffers`, `nodejs.memory.rss`. 
Host-metrics only publishes RSS, which overstates V8 heap by the external + native footprint; these give direct heap visibility per cluster worker so `NODE_MAX_OLD_SPACE_SIZE` can be sized against observed heap peaks rather than RSS. diff --git a/apps/webapp/app/v3/tracer.server.ts b/apps/webapp/app/v3/tracer.server.ts index 1115ab42de..c0ac36c446 100644 --- a/apps/webapp/app/v3/tracer.server.ts +++ b/apps/webapp/app/v3/tracer.server.ts @@ -38,6 +38,7 @@ import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions"; import { PrismaInstrumentation } from "@prisma/instrumentation"; import { HostMetrics } from "@opentelemetry/host-metrics"; import { AwsInstrumentation as AwsSdkInstrumentation } from "@opentelemetry/instrumentation-aws-sdk"; +import v8 from "node:v8"; import { awsEcsDetector, awsEc2Detector } from "@opentelemetry/resource-detector-aws"; import { detectResources, @@ -630,6 +631,39 @@ function configureNodejsMetrics({ meter }: { meter: Meter }) { unit: "1", // OpenTelemetry convention for ratios }); + // V8 heap + process memory. `NODE_MAX_OLD_SPACE_SIZE` caps V8 old space + // (reflected in `heap.limit`), but doesn't cap external/arrayBuffers/native + // memory — which is why RSS can exceed the heap total. Tracking all of these + // per-worker lets us size `NODE_MAX_OLD_SPACE_SIZE` against observed heap + // peaks rather than RSS (which overstates heap by the external + native + // footprint). `host-metrics` already publishes `process.memory.usage` + // (RSS), but we duplicate it under `nodejs.memory.rss` so all the memory + // numbers land in the same scope and are queryable together. 
+ const heapUsedGauge = meter.createObservableGauge("nodejs.memory.heap.used", { + description: "V8 heap actively in use after the last GC", + unit: "By", + }); + const heapTotalGauge = meter.createObservableGauge("nodejs.memory.heap.total", { + description: "V8 heap reserved (young + old generations)", + unit: "By", + }); + const heapLimitGauge = meter.createObservableGauge("nodejs.memory.heap.limit", { + description: "V8 heap size limit (configured via --max-old-space-size)", + unit: "By", + }); + const externalMemoryGauge = meter.createObservableGauge("nodejs.memory.external", { + description: "Memory used by C++ objects bound to JS (Buffer, etc.)", + unit: "By", + }); + const arrayBuffersGauge = meter.createObservableGauge("nodejs.memory.array_buffers", { + description: "Memory allocated for ArrayBuffers and SharedArrayBuffers", + unit: "By", + }); + const rssGauge = meter.createObservableGauge("nodejs.memory.rss", { + description: "Resident set size — total physical memory held by the process", + unit: "By", + }); + // Get UV threadpool size (defaults to 4 if not set) const uvThreadpoolSize = parseInt(process.env.UV_THREADPOOL_SIZE || "4", 10); @@ -687,6 +721,9 @@ function configureNodejsMetrics({ meter }: { meter: Meter }) { // diff.utilization is between 0 and 1 (fraction of time "active") const utilization = Number.isFinite(diff.utilization) ? diff.utilization : 0; + const mem = process.memoryUsage(); + const heapStats = v8.getHeapStatistics(); + return { threadpoolSize: uvThreadpoolSize, handlesByType, @@ -702,6 +739,14 @@ function configureNodejsMetrics({ meter }: { meter: Meter }) { p99: eventLoopLagP99?.values?.[0]?.value ?? 
0, utilization, }, + memory: { + heapUsed: mem.heapUsed, + heapTotal: mem.heapTotal, + heapLimit: heapStats.heap_size_limit, + external: mem.external, + arrayBuffers: mem.arrayBuffers, + rss: mem.rss, + }, }; } @@ -714,6 +759,7 @@ function configureNodejsMetrics({ meter }: { meter: Meter }) { requestsByType, requestsTotal, eventLoop, + memory, } = await readNodeMetrics(); // Observe UV threadpool size @@ -739,6 +785,14 @@ function configureNodejsMetrics({ meter }: { meter: Meter }) { res.observe(eventLoopLagP90Gauge, eventLoop.p90); res.observe(eventLoopLagP99Gauge, eventLoop.p99); res.observe(eluGauge, eventLoop.utilization); + + // Observe memory metrics (bytes) + res.observe(heapUsedGauge, memory.heapUsed); + res.observe(heapTotalGauge, memory.heapTotal); + res.observe(heapLimitGauge, memory.heapLimit); + res.observe(externalMemoryGauge, memory.external); + res.observe(arrayBuffersGauge, memory.arrayBuffers); + res.observe(rssGauge, memory.rss); }, [ uvThreadpoolSizeGauge, @@ -753,6 +807,12 @@ function configureNodejsMetrics({ meter }: { meter: Meter }) { eventLoopLagP90Gauge, eventLoopLagP99Gauge, eluGauge, + heapUsedGauge, + heapTotalGauge, + heapLimitGauge, + externalMemoryGauge, + arrayBuffersGauge, + rssGauge, ] ); } From 75ea38fdcd1a75c54bbf9a4053f39498e4d4637f Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Thu, 23 Apr 2026 19:42:37 +0100 Subject: [PATCH 2/2] fix(webapp): rotate ELU baseline so it reports per-interval, not cumulative lastEventLoopUtilization was set once at init and never reassigned, so every performance.eventLoopUtilization(current, last) diff was computed against the process-start snapshot. The nodejs.event_loop.utilization gauge was therefore a cumulative average over process lifetime rather than a per-interval measurement. Rotate the baseline immediately after computing the diff. 
--- apps/webapp/app/v3/tracer.server.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/webapp/app/v3/tracer.server.ts b/apps/webapp/app/v3/tracer.server.ts index c0ac36c446..3b924ff8a1 100644 --- a/apps/webapp/app/v3/tracer.server.ts +++ b/apps/webapp/app/v3/tracer.server.ts @@ -717,6 +717,9 @@ function configureNodejsMetrics({ meter }: { meter: Meter }) { currentEventLoopUtilization, lastEventLoopUtilization ); + // Rotate the baseline so the next collection reports per-interval + // utilization rather than the cumulative average from process start. + lastEventLoopUtilization = currentEventLoopUtilization; // diff.utilization is between 0 and 1 (fraction of time "active") const utilization = Number.isFinite(diff.utilization) ? diff.utilization : 0;