diff --git a/cli/src/utils/analytics.ts b/cli/src/utils/analytics.ts index ad5bd3bba6..7fdfa639cb 100644 --- a/cli/src/utils/analytics.ts +++ b/cli/src/utils/analytics.ts @@ -9,6 +9,7 @@ import { IS_PROD as defaultIsProd, DEBUG_ANALYTICS, } from '@codebuff/common/env' +import { shouldTrackAnalyticsEvent } from '@codebuff/common/util/analytics-sampling' import type { AnalyticsEvent } from '@codebuff/common/constants/analytics-events' @@ -211,6 +212,10 @@ export function trackEvent( return } + if (!shouldTrackAnalyticsEvent({ event, distinctId, properties })) { + return + } + try { client.capture({ distinctId, diff --git a/cli/src/utils/logger.ts b/cli/src/utils/logger.ts index 4cc6c0fce8..98a5410420 100644 --- a/cli/src/utils/logger.ts +++ b/cli/src/utils/logger.ts @@ -7,6 +7,10 @@ import { AnalyticsEvent } from '@codebuff/common/constants/analytics-events' import { env, IS_DEV, IS_TEST, IS_CI } from '@codebuff/common/env' import { createAnalyticsDispatcher } from '@codebuff/common/util/analytics-dispatcher' import { getAnalyticsEventId } from '@codebuff/common/util/analytics-log' +import { + isFullTelemetryEnabled, + summarizeAnalyticsValue, +} from '@codebuff/common/util/analytics-sampling' import { pino } from 'pino' import { @@ -169,10 +173,23 @@ function sendAnalyticsAndLog( // Skip if the log already has an eventId (to avoid duplicate tracking) const hasEventId = includeData && getAnalyticsEventId(normalizedData) !== null if (!IS_DEV && !IS_TEST && !IS_CI && !hasEventId) { + const fullTelemetry = isFullTelemetryEnabled({ + distinctId: loggerContext.userId, + properties: loggerContext, + }) + const includeRawData = + fullTelemetry || level === 'error' || level === 'fatal' + const dataProperties = + includeData && includeRawData + ? { data: normalizedData } + : includeData + ? { dataSummary: summarizeAnalyticsValue(normalizedData) } + : {} + trackEvent(AnalyticsEvent.CLI_LOG, { level, msg: stringFormat(normalizedMsg ?? '', ...args), - ...(includeData ? 
/**
 * Snapshot of the telemetry-related environment variables, captured once when
 * this test module is loaded, so each test can mutate `process.env` freely.
 */
const ORIGINAL_ENV = {
  CODEBUFF_FULL_TELEMETRY: process.env.CODEBUFF_FULL_TELEMETRY,
  CODEBUFF_FULL_TELEMETRY_IDS: process.env.CODEBUFF_FULL_TELEMETRY_IDS,
  CODEBUFF_FULL_TELEMETRY_USER_IDS:
    process.env.CODEBUFF_FULL_TELEMETRY_USER_IDS,
}

/**
 * Puts every variable recorded in `ORIGINAL_ENV` back to its load-time state:
 * variables that were unset are deleted, the others get their saved value.
 */
function restoreEnv() {
  Object.keys(ORIGINAL_ENV).forEach((name) => {
    const saved = ORIGINAL_ENV[name as keyof typeof ORIGINAL_ENV]
    if (saved !== undefined) {
      process.env[name] = saved
      return
    }
    // Assigning `undefined` would store the string "undefined", so delete.
    delete process.env[name]
  })
}
AnalyticsEvent.AGENT_STEP, + distinctId: 'user-1', + }) + + expect(second).toBe(first) + expect(typeof otherEvent).toBe('boolean') + }) + + it('honors full telemetry env flags and allowlists', () => { + process.env.CODEBUFF_FULL_TELEMETRY = 'true' + expect( + isFullTelemetryEnabled({ + distinctId: 'anyone', + }), + ).toBe(true) + + delete process.env.CODEBUFF_FULL_TELEMETRY + process.env.CODEBUFF_FULL_TELEMETRY_IDS = 'user-2,person@example.com' + + expect( + isFullTelemetryEnabled({ + distinctId: 'user-2', + }), + ).toBe(true) + expect( + isFullTelemetryEnabled({ + properties: { userEmail: 'person@example.com' }, + }), + ).toBe(true) + expect( + isFullTelemetryEnabled({ + distinctId: 'user-3', + }), + ).toBe(false) + }) + + it('summarizes values without retaining raw contents', () => { + expect(summarizeAnalyticsValue('secret text')).toEqual({ + kind: 'string', + length: 11, + }) + expect(summarizeAnalyticsValue(['a', 'b'])).toEqual({ + kind: 'array', + length: 2, + }) + expect(summarizeAnalyticsValue({ prompt: 'secret', count: 1 })).toEqual({ + kind: 'object', + keyCount: 2, + keys: ['prompt', 'count'], + }) + }) +}) diff --git a/common/src/util/analytics-sampling.ts b/common/src/util/analytics-sampling.ts new file mode 100644 index 0000000000..4e225bcb96 --- /dev/null +++ b/common/src/util/analytics-sampling.ts @@ -0,0 +1,200 @@ +import { AnalyticsEvent } from '../constants/analytics-events' + +const DEFAULT_SAMPLED_RATE = 0.01 + +const SAMPLED_EVENT_RATES: Partial> = { + [AnalyticsEvent.AGENT_STEP]: DEFAULT_SAMPLED_RATE, + [AnalyticsEvent.CHATGPT_OAUTH_REQUEST]: DEFAULT_SAMPLED_RATE, + [AnalyticsEvent.CLI_LOG]: DEFAULT_SAMPLED_RATE, + [AnalyticsEvent.FEEDBACK_BUTTON_HOVERED]: DEFAULT_SAMPLED_RATE, + [AnalyticsEvent.FOLLOWUP_CLICKED]: DEFAULT_SAMPLED_RATE, + [AnalyticsEvent.SLASH_COMMAND_USED]: DEFAULT_SAMPLED_RATE, + [AnalyticsEvent.SLASH_MENU_ACTIVATED]: DEFAULT_SAMPLED_RATE, + [AnalyticsEvent.TOOL_USE]: DEFAULT_SAMPLED_RATE, +} + +const ALWAYS_TRACK_EVENTS = 
/** Loosely typed analytics property bag; `undefined` when the caller passed none. */
type AnalyticsProperties = Record<string, unknown> | undefined

/**
 * Returns `value` unchanged when it is a string with at least one
 * non-whitespace character; otherwise returns `undefined`.
 */
function nonBlankString(value: unknown): string | undefined {
  return typeof value === 'string' && value.trim() ? value : undefined
}

/**
 * Reads `properties[key]` and returns it only when it is a non-blank string.
 *
 * @param properties - Property bag to read from (may be `undefined`).
 * @param key - Property name to look up.
 * @returns The untrimmed string value, or `undefined` when missing, not a
 *   string, or whitespace-only.
 */
function getStringProperty(
  properties: AnalyticsProperties,
  key: string,
): string | undefined {
  return nonBlankString(properties?.[key])
}

/**
 * Extracts a user identifier from event properties.
 *
 * Checks `userId`, `user_id` and `distinct_id` in that order, then falls back
 * to a nested `user.id`.
 *
 * @returns The first non-blank identifier found, or `undefined`.
 */
function getPropertyUserId(properties: AnalyticsProperties): string | undefined {
  const direct =
    getStringProperty(properties, 'userId') ??
    getStringProperty(properties, 'user_id') ??
    getStringProperty(properties, 'distinct_id')
  if (direct) {
    return direct
  }

  const user = properties?.user
  if (user && typeof user === 'object') {
    return nonBlankString((user as { id?: unknown }).id)
  }

  return undefined
}

/**
 * Parses a comma-separated environment value into a set of trimmed,
 * non-empty entries. `undefined` and blank input yield an empty set.
 */
function splitEnvList(value: string | undefined): Set<string> {
  return new Set(
    (value ?? '')
      .split(',')
      .map((item) => item.trim())
      .filter(Boolean),
  )
}

/**
 * Interprets an environment flag. Accepts `1`, `true` and `yes`, ignoring
 * surrounding whitespace and letter case so that `TRUE` or `Yes` also enable
 * the flag.
 */
function isTruthyEnv(value: string | undefined): boolean {
  const normalized = value?.trim().toLowerCase()
  return normalized === '1' || normalized === 'true' || normalized === 'yes'
}

/**
 * Decides whether sampling should be bypassed for this caller.
 *
 * Full telemetry is on when `CODEBUFF_FULL_TELEMETRY` is truthy, or when one
 * of the caller's identifiers (distinct id, property user id, `userEmail`,
 * `email`) appears in the allowlist. The allowlist is read from
 * `CODEBUFF_FULL_TELEMETRY_IDS`; when that variable yields no entries (unset,
 * empty or only separators) `CODEBUFF_FULL_TELEMETRY_USER_IDS` is used instead.
 *
 * @param params.distinctId - Analytics distinct id of the caller, if known.
 * @param params.properties - Event properties that may carry identifiers.
 * @returns `true` when every event should be tracked unsampled.
 */
export function isFullTelemetryEnabled(params: {
  distinctId?: string
  properties?: AnalyticsProperties
}): boolean {
  if (isTruthyEnv(process.env.CODEBUFF_FULL_TELEMETRY)) {
    return true
  }

  // `??` would stop at an empty-string primary variable and hide the alias,
  // so fall back based on the parsed entries instead.
  const primaryIds = splitEnvList(process.env.CODEBUFF_FULL_TELEMETRY_IDS)
  const ids =
    primaryIds.size > 0
      ? primaryIds
      : splitEnvList(process.env.CODEBUFF_FULL_TELEMETRY_USER_IDS)
  if (ids.size === 0) {
    return false
  }

  const candidates = [
    params.distinctId,
    getPropertyUserId(params.properties),
    getStringProperty(params.properties, 'userEmail'),
    getStringProperty(params.properties, 'email'),
  ].filter(
    (value): value is string =>
      typeof value === 'string' && value.length > 0,
  )

  return candidates.some((candidate) => ids.has(candidate))
}

/**
 * Resolves the sampling rate for an event.
 *
 * `CLI_LOG` events whose `level` property is `error` or `fatal`
 * (case-insensitive) and events in `ALWAYS_TRACK_EVENTS` are never sampled.
 * Other events use their entry in `SAMPLED_EVENT_RATES`, defaulting to 1.
 *
 * @returns A rate where `>= 1` means "always" and `<= 0` means "never".
 */
function getEventSampleRate(
  event: AnalyticsEvent,
  properties: AnalyticsProperties,
): number {
  const level = getStringProperty(properties, 'level')?.toLowerCase()
  if (
    event === AnalyticsEvent.CLI_LOG &&
    (level === 'error' || level === 'fatal')
  ) {
    return 1
  }

  if (ALWAYS_TRACK_EVENTS.has(event)) {
    return 1
  }

  return SAMPLED_EVENT_RATES[event] ?? 1
}

/**
 * 32-bit FNV-1a hash computed over the UTF-16 code units of `input`.
 *
 * @returns An unsigned 32-bit integer.
 */
function hashString(input: string): number {
  let hash = 2166136261 // FNV offset basis
  for (let i = 0; i < input.length; i++) {
    hash ^= input.charCodeAt(i)
    // Math.imul keeps the multiplication by the FNV prime in 32-bit space.
    hash = Math.imul(hash, 16777619)
  }
  return hash >>> 0
}

/**
 * Picks the stable identity used to bucket a caller for sampling.
 *
 * Tries, in order: `distinctId`, a user id from the properties,
 * `clientSessionId`, `userInputId`. Blank values are skipped, so an empty
 * `distinctId` no longer becomes the key.
 *
 * @returns The identity, or `undefined` when the event carries none.
 */
function getSamplingKey(params: {
  event: AnalyticsEvent
  distinctId?: string
  properties?: AnalyticsProperties
}): string | undefined {
  return (
    nonBlankString(params.distinctId) ??
    getPropertyUserId(params.properties) ??
    getStringProperty(params.properties, 'clientSessionId') ??
    getStringProperty(params.properties, 'userInputId')
  )
}

/**
 * Decides whether an analytics event should be sent.
 *
 * Events are always sent when full telemetry is enabled or the event's rate
 * is `>= 1`, and never when it is `<= 0`. Otherwise the decision is a
 * deterministic hash of the event name and the caller's sampling key, so the
 * same caller gets the same answer for the same event. When no stable key
 * exists the decision is random at the same rate; a constant key would put
 * every such caller in one bucket and make the event all-or-nothing.
 *
 * @param params.event - Event being considered.
 * @param params.distinctId - Analytics distinct id of the caller, if known.
 * @param params.properties - Event properties (used for level and identity).
 * @returns `true` when the event should be tracked.
 */
export function shouldTrackAnalyticsEvent(params: {
  event: AnalyticsEvent
  distinctId?: string
  properties?: AnalyticsProperties
}): boolean {
  if (isFullTelemetryEnabled(params)) {
    return true
  }

  const rate = getEventSampleRate(params.event, params.properties)
  if (rate >= 1) {
    return true
  }
  if (rate <= 0) {
    return false
  }

  const key = getSamplingKey(params)
  if (key === undefined) {
    return Math.random() < rate
  }

  const bucket = hashString(`${params.event}:${key}`) / 0xffffffff
  return bucket < rate
}

/** Names the runtime kind of `value`: `typeof`, plus `'array'` and `'null'`. */
function valueKind(value: unknown): string {
  if (Array.isArray(value)) {
    return 'array'
  }
  if (value === null) {
    return 'null'
  }
  return typeof value
}

/**
 * Describes a value's shape without copying its contents.
 *
 * Strings and arrays report their `length`; other objects report `keyCount`
 * and up to the first 25 key names; everything else reports only its kind.
 *
 * @param value - Arbitrary value to summarize.
 * @returns A small object that always has a `kind` field.
 */
export function summarizeAnalyticsValue(
  value: unknown,
): Record<string, unknown> {
  if (value === null || value === undefined) {
    return { kind: valueKind(value) }
  }

  if (typeof value === 'string') {
    return { kind: 'string', length: value.length }
  }

  if (Array.isArray(value)) {
    return { kind: 'array', length: value.length }
  }

  if (typeof value === 'object') {
    const keys = Object.keys(value as Record<string, unknown>)
    return {
      kind: 'object',
      keyCount: keys.length,
      // Cap the key list so very wide objects stay small in the payload.
      keys: keys.slice(0, 25),
    }
  }

  return { kind: valueKind(value) }
}
## Env DI Helpers diff --git a/packages/agent-runtime/src/__tests__/xml-tool-result-ordering.test.ts b/packages/agent-runtime/src/__tests__/xml-tool-result-ordering.test.ts index 65c6742d8e..467d97ea52 100644 --- a/packages/agent-runtime/src/__tests__/xml-tool-result-ordering.test.ts +++ b/packages/agent-runtime/src/__tests__/xml-tool-result-ordering.test.ts @@ -1,8 +1,10 @@ import { TEST_AGENT_RUNTIME_IMPL } from '@codebuff/common/testing/impl/agent-runtime' +import { AnalyticsEvent } from '@codebuff/common/constants/analytics-events' import { promptSuccess } from '@codebuff/common/util/error' import { beforeEach, describe, expect, it } from 'bun:test' import { processStreamWithTools } from '../tool-stream-parser' +import { createToolCallChunk } from './test-utils' import type { AgentRuntimeDeps } from '@codebuff/common/types/contracts/agent-runtime' import type { StreamChunk } from '@codebuff/common/types/contracts/llm' @@ -168,6 +170,44 @@ describe('XML tool result ordering', () => { } }) + it('tracks summarized tool use analytics without raw params or contents', async () => { + const trackedEvents: any[] = [] + + for await (const _chunk of processStreamWithTools({ + ...agentRuntimeImpl, + stream: createMockStream([ + createToolCallChunk('write_file', { + path: 'secret.ts', + content: 'private contents', + }), + ]), + processors: {}, + defaultProcessor: () => ({ onTagStart: () => {}, onTagEnd: () => {} }), + onResponseChunk: () => {}, + executeXmlToolCall: async () => {}, + trackEvent: (event) => { + trackedEvents.push(event) + }, + })) { + // Consume stream + } + + const toolUse = trackedEvents.find( + (event) => event.event === AnalyticsEvent.TOOL_USE, + ) + expect(toolUse).toBeDefined() + expect(toolUse.properties).toMatchObject({ + toolName: 'write_file', + inputType: 'object', + inputKeyCount: 2, + inputKeys: ['path', 'content'], + hasContents: false, + contentsLength: 0, + }) + expect(toolUse.properties.parsedParams).toBeUndefined() + 
/**
 * Builds a content-free description of a tool call's input for analytics.
 *
 * Strings and arrays report `inputLength`; non-null objects report
 * `inputKeyCount` and up to the first 25 key names; every other value reports
 * only its `inputType`.
 *
 * @param input - Parsed tool parameters of unknown shape.
 * @returns Summary fields intended to be spread into the event properties.
 */
function summarizeToolInput(input: unknown): Record<string, unknown> {
  // Classify once, then dispatch on the classification.
  const inputType = Array.isArray(input)
    ? 'array'
    : input === null
      ? 'null'
      : typeof input

  switch (inputType) {
    case 'string':
    case 'array':
      return {
        inputType,
        inputLength: (input as string | unknown[]).length,
      }
    case 'object': {
      const fieldNames = Object.keys(input as Record<string, unknown>)
      return {
        inputType,
        inputKeyCount: fieldNames.length,
        inputKeys: fieldNames.slice(0, 25),
      }
    }
    default:
      return { inputType }
  }
}
/**
 * Reduces a tool call's input to shape-only metadata for analytics.
 *
 * @param input - Tool input of unknown shape.
 * @returns `inputType` plus `inputLength` for strings and arrays, or
 *   `inputKeyCount` and at most 25 `inputKeys` for non-null objects.
 */
function summarizeToolInput(input: unknown): Record<string, unknown> {
  // Strings and arrays share the "length" shape.
  if (typeof input === 'string' || Array.isArray(input)) {
    const inputType = typeof input === 'string' ? 'string' : 'array'
    return { inputType, inputLength: input.length }
  }

  // Everything that is not a real object is described by its type name only.
  if (input === null || typeof input !== 'object') {
    return { inputType: input === null ? 'null' : typeof input }
  }

  const fieldNames = Object.keys(input)
  return {
    inputType: 'object',
    inputKeyCount: fieldNames.length,
    inputKeys: fieldNames.slice(0, 25),
  }
}
/**
 * Analytics hook handed to the agent runtime by the SDK.
 *
 * When the client environment is `prod`, the event is forwarded only if
 * `shouldTrackAnalyticsEvent` accepts it; in every other environment the
 * event is forwarded unconditionally.
 */
const trackSdkRuntimeEvent: TrackEventFn = (eventParams) => {
  const isProd = clientEnv.NEXT_PUBLIC_CB_ENVIRONMENT === 'prod'

  // Short-circuit keeps the sampling check from running outside prod.
  const shouldForward =
    !isProd ||
    shouldTrackAnalyticsEvent({
      event: eventParams.event,
      distinctId: eventParams.userId,
      properties: eventParams.properties,
    })

  if (shouldForward) {
    trackCommonEvent(eventParams)
  }
}