Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion infra/main.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -301,8 +301,12 @@ module applicationInsights 'br/public:avm/res/insights/component:0.6.0' = if (en
disableIpMasking: false
flowType: 'Bluefield'
// WAF aligned configuration for Monitoring
// The AVM `insights/component` module wires Application Insights to
// the Log Analytics workspace via `workspaceResourceId` (workspace-
// based App Insights). A separate `diagnosticSettings` entry on the
// SAME workspace causes duplicate ingestion of platform logs.
// Source: AB#37816 — see CKM #811 reference implementation.
workspaceResourceId: enableMonitoring ? logAnalyticsWorkspaceResourceId : ''
diagnosticSettings: enableMonitoring ? [{ workspaceResourceId: logAnalyticsWorkspaceResourceId }] : null
}
}

Expand Down
6 changes: 5 additions & 1 deletion infra/main_custom.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -279,8 +279,12 @@ module applicationInsights 'br/public:avm/res/insights/component:0.6.0' = if (en
disableIpMasking: false
flowType: 'Bluefield'
// WAF aligned configuration for Monitoring
// The AVM `insights/component` module wires Application Insights to
// the Log Analytics workspace via `workspaceResourceId` (workspace-
// based App Insights). A separate `diagnosticSettings` entry on the
// SAME workspace causes duplicate ingestion of platform logs.
// Source: AB#37816 — see CKM #811 reference implementation.
workspaceResourceId: enableMonitoring ? logAnalyticsWorkspaceResourceId : ''
diagnosticSettings: enableMonitoring ? [{ workspaceResourceId: logAnalyticsWorkspaceResourceId }] : null
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/backend-api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@ dependencies = [
"azure-ai-agents==1.2.0b3",
"azure-appconfiguration==1.7.1",
"azure-identity==1.25.0",
"azure-monitor-events-extension==0.1.0",
"azure-monitor-opentelemetry==1.7.0",
"azure-search-documents==11.6.0b12",
"azure-storage-blob==12.26.0",
"azure-storage-queue==12.13.0",
"fastapi[standard]==0.116.1",
"httpx==0.28.1",
"opentelemetry-instrumentation-fastapi==0.57b0",
"pydantic-settings==2.10.1",
"python-dotenv",
"python-multipart==0.0.22",
Expand Down
14 changes: 14 additions & 0 deletions src/backend-api/src/app/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,17 @@ APP_CONFIGURATION_URL=""
# AZURE_PACKAGE_LOGGING_LEVEL="WARNING" # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL
# AZURE_LOGGING_PACKAGES="azure.core.pipeline.policies.http_logging_policy,azure.storage.blob,azure.storage.queue,azure.core,azure.identity,azure.storage,azure.core.pipeline,azure.core.pipeline.policies,azure.core.pipeline.transport,openai,openai._client,httpx,httpcore,semantic_kernel,urllib3,msal"

# ------------------------------------------------------------------
# Application Insights / OpenTelemetry
# ------------------------------------------------------------------
# When deployed via the bundled Bicep, this value is injected by
# `infra/main.bicep` -> `containerAppBackend` from the
# `applicationInsights.outputs.connectionString` output. Leave unset for
# local dev to skip telemetry export entirely (the app logs one INFO line
# at startup, warns once the first time a custom event is emitted, and
# otherwise behaves normally).
# APPLICATIONINSIGHTS_CONNECTION_STRING=""

# Optional: clamp basic logging level for App Insights ingestion.
# Defaults inherit from APP_LOGGING_LEVEL above.
# AZURE_BASIC_LOGGING_LEVEL="INFO"

96 changes: 96 additions & 0 deletions src/backend-api/src/app/application.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import logging
import os
from datetime import datetime

from fastapi.middleware.cors import CORSMiddleware
from libs.base.application_base import Application_Base
from libs.base.typed_fastapi import TypedFastAPI
from libs.logging.span_filters import (
DropASGIResponseBodySpanProcessor,
DropCosmosDependencySpanProcessor,
)
from libs.repositories.file_repository import FileRepository
from libs.repositories.process_repository import ProcessRepository
from libs.repositories.process_status_repository import ProcessStatusRepository
Expand All @@ -20,6 +25,14 @@
from routers import router_debug, router_files, router_process
from routers.http_probes import router as http_probes

logger = logging.getLogger(__name__)

# URL patterns that should NOT generate request telemetry. Intended to
# cover the probe routes registered in `routers/http_probes.py`.
# NOTE(review): the OpenTelemetry FastAPI instrumentation documents
# `excluded_urls` as a comma-delimited list of *regexes* searched against
# the request URL (not plain substrings), so every URL containing
# "health" or "startup" is excluded — confirm no business route matches.
_OTEL_EXCLUDED_URLS = "health,startup"


class Application(Application_Base):
"""
Expand Down Expand Up @@ -57,11 +70,94 @@ def initialize(self):
allow_headers=["*"],
)

# Wire up Azure Monitor / OpenTelemetry BEFORE the routers are
# included so the exporter and span processors are in place before
# any route is registered. `_configure_azure_monitor` logs a single
# INFO line and returns without configuring anything when the
# connection string env var is absent.
self._configure_azure_monitor()

self.app.include_router(http_probes)
self._register_dependencies()
self._config_routers()
# self._initialize_database()

# Instrumenting AFTER routers are registered means every route is
# automatically wrapped by the OTEL middleware.
self._instrument_fastapi()

def _configure_azure_monitor(self):
    """Set up the Azure Monitor OpenTelemetry exporter when configured.

    Reads ``APPLICATIONINSIGHTS_CONNECTION_STRING`` from the process
    environment. A missing or whitespace-only value means "telemetry
    disabled": one INFO line is logged and nothing else happens.

    Otherwise ``configure_azure_monitor`` is imported lazily and called
    with live metrics enabled and the two noise-suppressing span
    processors (``DropASGIResponseBodySpanProcessor`` and
    ``DropCosmosDependencySpanProcessor``).

    Any exception raised while importing or configuring is logged with
    its traceback and swallowed, so a telemetry problem can never stop
    the application from starting.
    """
    conn_str = os.environ.get("APPLICATIONINSIGHTS_CONNECTION_STRING", "")
    conn_str = conn_str.strip()
    if conn_str == "":
        logger.info(
            "APPLICATIONINSIGHTS_CONNECTION_STRING not set; "
            "skipping Azure Monitor OpenTelemetry configuration."
        )
        return

    try:
        # Imported here so the SDK is only loaded when export is enabled.
        from azure.monitor.opentelemetry import configure_azure_monitor

        noise_filters = [
            DropASGIResponseBodySpanProcessor(),
            DropCosmosDependencySpanProcessor(),
        ]
        configure_azure_monitor(
            connection_string=conn_str,
            enable_live_metrics=True,
            span_processors=noise_filters,
        )
        # The connection string embeds the ingestion key, so only the
        # fact that configuration succeeded is logged — never the value.
        logger.info(
            "Azure Monitor OpenTelemetry configured (live metrics enabled)."
        )
    except Exception:  # noqa: BLE001 — telemetry must never break startup
        logger.exception(
            "Failed to configure Azure Monitor OpenTelemetry; "
            "continuing without App Insights export."
        )

def _instrument_fastapi(self):
    """Attach the OpenTelemetry FastAPI instrumentation to ``self.app``.

    Does nothing when ``APPLICATIONINSIGHTS_CONNECTION_STRING`` is unset
    or blank, because no exporter is wired up in that case.

    When enabled, ``FastAPIInstrumentor.instrument_app`` is called with
    ``excluded_urls=_OTEL_EXCLUDED_URLS`` so probe traffic does not
    produce request telemetry. Any exception raised while importing or
    instrumenting is logged with its traceback and swallowed.
    """
    exporter_configured = bool(
        os.environ.get("APPLICATIONINSIGHTS_CONNECTION_STRING", "").strip()
    )
    if not exporter_configured:
        # Nothing would consume the spans; skip the import entirely.
        return

    try:
        from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor

        FastAPIInstrumentor.instrument_app(
            self.app,
            excluded_urls=_OTEL_EXCLUDED_URLS,
        )
        logger.info(
            "FastAPIInstrumentor attached (excluded_urls=%s).",
            _OTEL_EXCLUDED_URLS,
        )
    except Exception:  # noqa: BLE001 — telemetry must never break startup
        logger.exception(
            "Failed to attach FastAPIInstrumentor; "
            "continuing without per-request telemetry."
        )

def _config_routers(self):
"""
Configure routers for the FastAPI application.
Expand Down
24 changes: 24 additions & 0 deletions src/backend-api/src/app/libs/base/application_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,19 @@
from libs.application.application_context import AppContext
from libs.azure.app_configuration import AppConfigurationHelper

# Logger packages that emit at INFO/DEBUG levels often enough to drown
# out signal in Application Insights logs when the OTEL log handler is
# attached. This is a backstop in addition to (not a replacement for)
# the `AZURE_LOGGING_PACKAGES` env-var driven filtering: every logger
# listed here is raised to at least WARNING, even if caller config set
# it lower. Loggers already at WARNING or stricter are left untouched.
_NOISY_LOGGER_PACKAGES = (
    "azure.core.pipeline.policies.http_logging_policy",
    "azure.cosmos",
    "opentelemetry.sdk",
    "azure.monitor.opentelemetry.exporter.export._base",
)


class Application_Base(ABC):
application_context: AppContext = None
Expand Down Expand Up @@ -78,6 +91,17 @@ def __init__(self, env_file_path: str | None = None, **data):
):
logging.getLogger(logger_name).setLevel(azure_level)

# Hard-suppress known noisy packages regardless of operator
# config. Without this, the App Insights logs view is
# dominated by per-request HTTP policy logs and per-call
# Cosmos diagnostics — see AC #3 / AC #4 of AB#37816.
# We only ever raise levels here: a logger the operator has already
# set to WARNING or stricter (e.g. ERROR) is left untouched.
for noisy_pkg in _NOISY_LOGGER_PACKAGES:
noisy_logger = logging.getLogger(noisy_pkg)
if noisy_logger.level == logging.NOTSET or noisy_logger.level < logging.WARNING:
noisy_logger.setLevel(logging.WARNING)

# Initialize the application
self.initialize()

Expand Down
23 changes: 23 additions & 0 deletions src/backend-api/src/app/libs/logging/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Application-level logging and telemetry helpers.

This subpackage hosts the small Application Insights / OpenTelemetry
integration helpers used by the backend-api application:

- ``event_utils`` — a tiny wrapper around
``azure.monitor.events.extension.track_event`` that no-ops when the
``APPLICATIONINSIGHTS_CONNECTION_STRING`` environment variable is not
configured. Callers can therefore emit structured events from
any router/service without conditionally guarding each call site.
- ``span_filters`` — custom OpenTelemetry ``SpanProcessor`` implementations
that drop noisy spans before they are exported to Application Insights
(per-chunk ASGI ``http.response.body`` spans and Cosmos DB dependency
spans). These keep the App Insights ingestion cost and the
end-to-end transaction view clean for the Container Migration workflow.

Nothing in this subpackage imports Azure SDKs at module-import time, so
it is safe to import from contexts where the App Insights SDK may not be
fully wired up yet (e.g. application bootstrap before
``configure_azure_monitor`` has run).
"""
132 changes: 132 additions & 0 deletions src/backend-api/src/app/libs/logging/event_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Lightweight helpers for emitting Application Insights *custom events*.

The backend-api routers want to emit structured events such as
``"UploadFilesSuccess"`` / ``"UploadFilesError"`` for business-level
observability — independent of whatever distributed-tracing spans the
OpenTelemetry instrumentation produces. ``azure-monitor-events-extension``
provides ``track_event`` for exactly this purpose, but two practical
problems show up in production:

1. ``track_event`` raises (or warns repeatedly) when called before
``configure_azure_monitor`` has been invoked — for example in unit
tests or in local dev runs where ``APPLICATIONINSIGHTS_CONNECTION_STRING``
is intentionally unset.
2. Importing ``azure.monitor.events.extension`` eagerly at module
import time slows cold-start and pulls in the OTEL log SDK even for
code paths that never emit a custom event.

``track_event_if_configured`` solves both problems. It:

* short-circuits when the connection string env var is empty/unset and
emits a single warning the first time it is called (subsequent
unconfigured calls are silent — see ``_warned_unconfigured`` below);
* lazily imports ``azure.monitor.events.extension`` only on the first
configured call;
* swallows and logs export failures so that telemetry problems can
never break a request.

The function is deliberately small and side-effect-free outside the
optional Application Insights call. Tests can therefore verify both
the gating behaviour and the export behaviour without standing up a
real OpenTelemetry pipeline.
"""
from __future__ import annotations

import logging
import os
from typing import Any, Mapping

logger = logging.getLogger(__name__)

# Environment variable that ``configure_azure_monitor`` keys off and that
# this module uses as the single source of truth for "is App Insights
# configured".
APP_INSIGHTS_CONN_STRING_ENV = "APPLICATIONINSIGHTS_CONNECTION_STRING"

# Module-private log format for the one-shot "not configured" warning;
# its single ``%s`` placeholder receives the event name. Kept as a
# constant so tests can assert on the wording without duplicating the
# string. The env-var *value* is deliberately never interpolated into
# any log message, because a connection string is a secret.
_UNCONFIGURED_WARNING = (
    "APPLICATIONINSIGHTS_CONNECTION_STRING is not set; "
    "track_event_if_configured(name=%s) is a no-op."
)

# Module-level latch so we warn at most once per process when the
# connection string is missing. Reset by tests via
# ``reset_unconfigured_warning_for_tests``.
_warned_unconfigured: bool = False


def _is_app_insights_configured() -> bool:
    """Report whether a usable App Insights connection string is present.

    The environment is consulted on every call instead of being cached
    at import time, so a value that only appears in ``os.environ`` after
    this module was imported (for example once application bootstrap has
    loaded its configuration) is still picked up.

    Returns
    -------
    bool
        ``True`` when the variable named by ``APP_INSIGHTS_CONN_STRING_ENV``
        is set and contains at least one non-whitespace character.
    """
    raw = os.environ.get(APP_INSIGHTS_CONN_STRING_ENV)
    if raw is None:
        return False
    return raw.strip() != ""


def reset_unconfigured_warning_for_tests() -> None:
    """Re-arm the one-shot "not configured" warning (intended for tests).

    Clears the module-level ``_warned_unconfigured`` latch so the next
    unconfigured ``track_event_if_configured`` call logs its warning
    again.
    """
    global _warned_unconfigured
    _warned_unconfigured = False


def track_event_if_configured(
    name: str, properties: Mapping[str, Any] | None = None
) -> None:
    """Send an Application Insights custom event when telemetry is enabled.

    Parameters
    ----------
    name:
        Event name as it should appear in the App Insights
        ``customEvents`` table, e.g. ``"UploadFilesSuccess"``.
    properties:
        Optional mapping forwarded as the event's custom dimensions.
        ``None`` or an empty mapping is forwarded as an empty dict; any
        other mapping is shallow-copied before being forwarded.

    Behaviour
    ---------
    * When ``_is_app_insights_configured()`` is false the call is a
      no-op. The first such call in the process logs one WARNING;
      later ones are silent.
    * When configured, ``azure.monitor.events.extension.track_event``
      is imported on demand and invoked. If the package is missing, a
      WARNING is logged and the event is dropped.
    * Any exception raised by ``track_event`` is caught and reported via
      ``logger.exception`` (ERROR level, with traceback) — telemetry
      must never break a request.
    """
    global _warned_unconfigured

    if not _is_app_insights_configured():
        if _warned_unconfigured:
            return
        logger.warning(_UNCONFIGURED_WARNING, name)
        _warned_unconfigured = True
        return

    dimensions: dict[str, Any] = dict(properties or {})

    try:
        # Function-local import: keeps cold-start cheap and is resolved at
        # call time, so tests can stub ``track_event`` on the
        # ``azure.monitor.events.extension`` module itself.
        from azure.monitor.events.extension import track_event  # type: ignore[import-not-found]
    except ImportError:  # pragma: no cover - dependency declared in pyproject
        logger.warning(
            "azure-monitor-events-extension is not installed; "
            "skipping track_event(name=%s).",
            name,
        )
        return

    try:
        track_event(name, dimensions)
    except Exception:  # noqa: BLE001 — telemetry must never break a request
        logger.exception(
            "Failed to publish App Insights custom event name=%s.", name
        )
Loading
Loading