Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions scripts/deploy/lib/checkTelemetryErrors.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,10 @@ describe('check-telemetry-errors', () => {
NO_TELEMETRY_ERRORS_ON_SPECIFIC_ORG_MOCK,
NO_TELEMETRY_ERROR_ON_SPECIFIC_MESSAGE_MOCK,
])
await assert.rejects(() => checkTelemetryErrors(['us1'], '6.2.1'), /Telemetry errors found in the last 5 minutes/)
await assert.rejects(
() => checkTelemetryErrors(['us1'], '6.2.1'),
/\[datadoghq\.com\] Telemetry errors: found 10000 events \(threshold: 300\) in the last 5 minutes/
)
})

it('should throw an error if telemetry errors on specific org are found for a given datacenter', async () => {
Expand All @@ -111,7 +114,7 @@ describe('check-telemetry-errors', () => {

await assert.rejects(
() => checkTelemetryErrors(['us1'], '6.2.1'),
/Telemetry errors on specific org found in the last 5 minutes/
/\[datadoghq\.com\] Telemetry errors on specific org: found 500 events \(threshold: 100\) in the last 5 minutes/
)
})

Expand All @@ -124,7 +127,7 @@ describe('check-telemetry-errors', () => {

await assert.rejects(
() => checkTelemetryErrors(['us1'], '6.2.1'),
/Telemetry error on specific message found in the last 5 minutes/
/\[datadoghq\.com\] Telemetry error on specific message: found 1600 events \(threshold: 100\) in the last 5 minutes/
)
})

Expand Down
94 changes: 55 additions & 39 deletions scripts/deploy/lib/checkTelemetryErrors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ function getQueries(version: string): Query[] {
export async function checkTelemetryErrors(datacenters: string[], version: string): Promise<void> {
const queries = getQueries(version)

printLog(
`Checking telemetry errors for version ${version} across ${datacenters.length} datacenter(s): ${datacenters.join(', ')}`
)

// Create a fresh HTTP agent for this batch of telemetry checks
const agent = createTelemetryAgent()

Expand All @@ -83,20 +87,23 @@ async function checkDatacenterTelemetryErrors(datacenter: string, queries: Query
const applicationKey = getTelemetryOrgApplicationKey(site)

if (!apiKey || !applicationKey) {
printLog(`No API key or application key found for ${site}, skipping...`)
printLog(`[${site}] No API key or application key found for ${site}, skipping...`)
return
}

for (let i = 0; i < queries.length; i++) {
const query = queries[i]
const buckets = await queryLogsApi(site, apiKey, applicationKey, query, agent)
const count = buckets[0]?.computes?.c0

// buckets are sorted by count, so we only need to check the first one
if (buckets[0]?.computes?.c0 > query.threshold) {
throw new Error(`${query.name} found in the last ${TIME_WINDOW_IN_MINUTES} minutes,
if (count > query.threshold) {
throw new Error(`[${site}] ${query.name}: found ${count} events (threshold: ${query.threshold}) in the last ${TIME_WINDOW_IN_MINUTES} minutes,
Comment thread
BenoitZugmeyer marked this conversation as resolved.
see ${computeLogsLink(site, query)}`)
}

printLog(`[${site}] ${query.name}: found ${count} event(s) (threshold: ${query.threshold})`)

// Skip rate limit delay after last query
if (i < queries.length - 1) {
await timeout(RATE_LIMIT_DELAY_MS)
Expand All @@ -112,52 +119,61 @@ async function queryLogsApi(
agent: Agent,
attempt: number = 1
): Promise<QueryResultBucket[]> {
const response = await fetch(`https://api.${site}/api/v2/logs/analytics/aggregate`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'DD-API-KEY': apiKey,
'DD-APPLICATION-KEY': applicationKey,
},
body: JSON.stringify({
compute: [
{
aggregation: 'count',
},
],
...(query.groupBy
? {
group_by: [
{
facet: query.groupBy,
sort: {
type: 'measure',
aggregation: 'count',
},
},
],
}
: {}),
filter: {
from: `now-${TIME_WINDOW_IN_MINUTES}m`,
to: 'now',
query: query.query,
let response: Response
try {
response = await fetch(`https://api.${site}/api/v2/logs/analytics/aggregate`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'DD-API-KEY': apiKey,
'DD-APPLICATION-KEY': applicationKey,
},
}),
// Use dedicated agent to avoid connection pool conflicts.
dispatcher: agent,
})
body: JSON.stringify({
compute: [
{
aggregation: 'count',
},
],
...(query.groupBy
? {
group_by: [
{
facet: query.groupBy,
sort: {
type: 'measure',
aggregation: 'count',
},
},
],
}
: {}),
filter: {
from: `now-${TIME_WINDOW_IN_MINUTES}m`,
to: 'now',
query: query.query,
},
}),
// Use dedicated agent to avoid connection pool conflicts.
dispatcher: agent,
})
} catch (error) {
throw new Error(`[${site}] Network error on attempt ${attempt}/${MAX_RETRIES} for "${query.name}"`, {
cause: error,
})
}

if (shouldRetry(response, attempt)) {
printLog(
`503 Service Unavailable, retrying in ${RATE_LIMIT_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})...`
`[${site}] 503 Service Unavailable for "${query.name}", retrying in ${RATE_LIMIT_DELAY_MS / 1000}s (attempt ${attempt}/${MAX_RETRIES})...`
)
await timeout(RATE_LIMIT_DELAY_MS)
return queryLogsApi(site, apiKey, applicationKey, query, agent, attempt + 1)
}

if (!response.ok) {
throw await createFetchError(response)
throw new Error(`[${site}] HTTP error on attempt ${attempt}/${MAX_RETRIES} for "${query.name}"`, {
cause: await createFetchError(response),
})
}

const data = (await response.json()) as QueryResult
Expand Down
3 changes: 2 additions & 1 deletion scripts/lib/datacenter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ async function fetchDatacentersFromRuntimeMetadataService(): Promise<Datacenters
const token = await getVaultToken()

// Filter for production environment and site flavor only
const selector = 'datacenter.environment == "prod" && datacenter.flavor == "site"'
const selector =
'datacenter.environment == "prod" && datacenter.flavor == "site" && datacenter.status != "deprecated"'

const response = await fetchHandlingError(
`${RUNTIME_METADATA_SERVICE_URL}?selector=${encodeURIComponent(selector)}`,
Expand Down
Loading