From 273c4613a987131143af442e9e619cb6793c86ca Mon Sep 17 00:00:00 2001 From: Benoit Zugmeyer Date: Mon, 15 Jun 2026 12:06:04 +0200 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=94=8D=20Add=20detailed=20logging=20t?= =?UTF-8?q?o=20checkTelemetryErrors=20for=20easier=20debugging?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/deploy/lib/checkTelemetryErrors.ts | 94 +++++++++++++--------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/scripts/deploy/lib/checkTelemetryErrors.ts b/scripts/deploy/lib/checkTelemetryErrors.ts index 8457bad9ef..1f93331c16 100644 --- a/scripts/deploy/lib/checkTelemetryErrors.ts +++ b/scripts/deploy/lib/checkTelemetryErrors.ts @@ -57,6 +57,10 @@ function getQueries(version: string): Query[] { export async function checkTelemetryErrors(datacenters: string[], version: string): Promise { const queries = getQueries(version) + printLog( + `Checking telemetry errors for version ${version} across ${datacenters.length} datacenter(s): ${datacenters.join(', ')}` + ) + // Create a fresh HTTP agent for this batch of telemetry checks const agent = createTelemetryAgent() @@ -83,20 +87,23 @@ async function checkDatacenterTelemetryErrors(datacenter: string, queries: Query const applicationKey = getTelemetryOrgApplicationKey(site) if (!apiKey || !applicationKey) { - printLog(`No API key or application key found for ${site}, skipping...`) + printLog(`[${site}] No API key or application key found for ${site}, skipping...`) return } for (let i = 0; i < queries.length; i++) { const query = queries[i] const buckets = await queryLogsApi(site, apiKey, applicationKey, query, agent) + const count = buckets[0]?.computes?.c0 // buckets are sorted by count, so we only need to check the first one - if (buckets[0]?.computes?.c0 > query.threshold) { - throw new Error(`${query.name} found in the last ${TIME_WINDOW_IN_MINUTES} minutes, + if (count > query.threshold) { + throw new Error(`[${site}] ${query.name}: 
found ${count} events (threshold: ${query.threshold}) in the last ${TIME_WINDOW_IN_MINUTES} minutes, see ${computeLogsLink(site, query)}`) } + printLog(`[${site}] ${query.name}: found ${count} event(s) (threshold: ${query.threshold})`) + // Skip rate limit delay after last query if (i < queries.length - 1) { await timeout(RATE_LIMIT_DELAY_MS) @@ -112,52 +119,61 @@ async function queryLogsApi( agent: Agent, attempt: number = 1 ): Promise { - const response = await fetch(`https://api.${site}/api/v2/logs/analytics/aggregate`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'DD-API-KEY': apiKey, - 'DD-APPLICATION-KEY': applicationKey, - }, - body: JSON.stringify({ - compute: [ - { - aggregation: 'count', - }, - ], - ...(query.groupBy - ? { - group_by: [ - { - facet: query.groupBy, - sort: { - type: 'measure', - aggregation: 'count', - }, - }, - ], - } - : {}), - filter: { - from: `now-${TIME_WINDOW_IN_MINUTES}m`, - to: 'now', - query: query.query, + let response: Response + try { + response = await fetch(`https://api.${site}/api/v2/logs/analytics/aggregate`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'DD-API-KEY': apiKey, + 'DD-APPLICATION-KEY': applicationKey, }, - }), - // Use dedicated agent to avoid connection pool conflicts. - dispatcher: agent, - }) + body: JSON.stringify({ + compute: [ + { + aggregation: 'count', + }, + ], + ...(query.groupBy + ? { + group_by: [ + { + facet: query.groupBy, + sort: { + type: 'measure', + aggregation: 'count', + }, + }, + ], + } + : {}), + filter: { + from: `now-${TIME_WINDOW_IN_MINUTES}m`, + to: 'now', + query: query.query, + }, + }), + // Use dedicated agent to avoid connection pool conflicts. 
+ dispatcher: agent, }) + } catch (error) { + throw new Error(`[${site}] Network error on attempt ${attempt}/${MAX_RETRIES} for "${query.name}"`, { + cause: error, + }) + } if (shouldRetry(response, attempt)) { printLog( - `503 Service Unavailable, retrying in ${RATE_LIMIT_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})...` + `[${site}] 503 Service Unavailable for "${query.name}", retrying in ${RATE_LIMIT_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})...` ) await timeout(RATE_LIMIT_DELAY_MS) return queryLogsApi(site, apiKey, applicationKey, query, agent, attempt + 1) } if (!response.ok) { - throw await createFetchError(response) + throw new Error(`[${site}] HTTP error on attempt ${attempt}/${MAX_RETRIES} for "${query.name}"`, { + cause: await createFetchError(response), + }) } const data = (await response.json()) as QueryResult From 529634f65eb9cadb42ccb27a08a2949b32006b65 Mon Sep 17 00:00:00 2001 From: Benoit Zugmeyer Date: Tue, 16 Jun 2026 10:47:49 +0200 Subject: [PATCH 2/3] exclude deprecated datacenters --- scripts/lib/datacenter.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/lib/datacenter.ts b/scripts/lib/datacenter.ts index d5f4ba5849..fa3878d45c 100644 --- a/scripts/lib/datacenter.ts +++ b/scripts/lib/datacenter.ts @@ -68,7 +68,8 @@ async function fetchDatacentersFromRuntimeMetadataService(): Promise Date: Tue, 16 Jun 2026 10:57:09 +0200 Subject: [PATCH 3/3] =?UTF-8?q?=E2=9C=85=20update=20error=20assertions=20i?= =?UTF-8?q?n=20checkTelemetryErrors=20tests=20to=20match=20new=20message?= =?UTF-8?q?=20format?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/deploy/lib/checkTelemetryErrors.spec.ts | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/deploy/lib/checkTelemetryErrors.spec.ts b/scripts/deploy/lib/checkTelemetryErrors.spec.ts index 5eeca37dff..9243c27e9f 100644 --- a/scripts/deploy/lib/checkTelemetryErrors.spec.ts +++
b/scripts/deploy/lib/checkTelemetryErrors.spec.ts @@ -99,7 +99,10 @@ describe('check-telemetry-errors', () => { NO_TELEMETRY_ERRORS_ON_SPECIFIC_ORG_MOCK, NO_TELEMETRY_ERROR_ON_SPECIFIC_MESSAGE_MOCK, ]) - await assert.rejects(() => checkTelemetryErrors(['us1'], '6.2.1'), /Telemetry errors found in the last 5 minutes/) + await assert.rejects( + () => checkTelemetryErrors(['us1'], '6.2.1'), + /\[datadoghq\.com\] Telemetry errors: found 10000 events \(threshold: 300\) in the last 5 minutes/ + ) }) it('should throw an error if telemetry errors on specific org are found for a given datacenter', async () => { @@ -111,7 +114,7 @@ describe('check-telemetry-errors', () => { await assert.rejects( () => checkTelemetryErrors(['us1'], '6.2.1'), - /Telemetry errors on specific org found in the last 5 minutes/ + /\[datadoghq\.com\] Telemetry errors on specific org: found 500 events \(threshold: 100\) in the last 5 minutes/ ) }) @@ -124,7 +127,7 @@ describe('check-telemetry-errors', () => { await assert.rejects( () => checkTelemetryErrors(['us1'], '6.2.1'), - /Telemetry error on specific message found in the last 5 minutes/ + /\[datadoghq\.com\] Telemetry error on specific message: found 1600 events \(threshold: 100\) in the last 5 minutes/ ) })