From d55d03b6c18b75bcd9381d22e7470df82b5be080 Mon Sep 17 00:00:00 2001 From: Ari Angelo Date: Tue, 12 May 2026 10:01:32 +0200 Subject: [PATCH 1/3] =?UTF-8?q?feat(ci):=20split=20hourly=20heartbeat=20by?= =?UTF-8?q?=20concern=20=E2=80=94=20SDK=20vs=20HE-TME?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add @pytest.mark.monitors("he-tme") marker to tag tests that exercise the HE-TME application end-to-end. Tests without the marker are SDK health checks (auth, listing, connectivity). The hourly scheduled workflow now runs two separate pytest invocations and sends two independent Better Stack heartbeats so an HE-TME outage no longer pollutes the SDK monitor and vice versa. The HE-TME heartbeat URLs are optional secrets; the step skips gracefully until the monitors are created in Better Stack. --- .github/workflows/_scheduled-test-hourly.yml | 219 +++++++++++++----- .../scheduled-testing-production-hourly.yml | 2 + .../scheduled-testing-staging-hourly.yml | 2 + pyproject.toml | 1 + tests/aignostics/platform/e2e_test.py | 3 + 5 files changed, 175 insertions(+), 52 deletions(-) diff --git a/.github/workflows/_scheduled-test-hourly.yml b/.github/workflows/_scheduled-test-hourly.yml index 939366198..dc0be9f2f 100644 --- a/.github/workflows/_scheduled-test-hourly.yml +++ b/.github/workflows/_scheduled-test-hourly.yml @@ -22,6 +22,9 @@ on: required: true BETTERSTACK_HEARTBEAT_URL_STAGING: required: true + BETTERSTACK_HEARTBEAT_URL_HE_TME_STAGING: + # Optional until the HE-TME Better Stack monitor is created; the heartbeat step skips gracefully when absent. + required: false AIGNOSTICS_CLIENT_ID_DEVICE_PRODUCTION: required: true AIGNOSTICS_REFRESH_TOKEN_PRODUCTION: @@ -30,6 +33,9 @@ on: required: true BETTERSTACK_HEARTBEAT_URL_PRODUCTION: required: true + BETTERSTACK_HEARTBEAT_URL_HE_TME_PRODUCTION: + # Optional until the HE-TME Better Stack monitor is created; the heartbeat step skips gracefully when absent. 
+ required: false SENTRY_DSN: required: true @@ -90,18 +96,59 @@ jobs: echo "$GCP_CREDENTIALS" | base64 -d > credentials.json echo "GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/credentials.json" >> $GITHUB_ENV - - name: Test / scheduled + # Tests are split into two runs so failures can be routed to the correct Better Stack monitor: + # SDK-layer failures (auth, listing, connectivity) go to the SDK monitor; HE-TME application + # failures go to the HE-TME monitor. Tests declare which system they monitor via + # @pytest.mark.monitors("he-tme"); tests without that marker are SDK health checks. + - name: Test / scheduled / sdk + id: test_sdk + env: + SENTRY_DSN: ${{ secrets.SENTRY_DSN }} + shell: bash + run: | + # set +e so a test failure does not abort the step — we capture the exit code + # manually and send it to Better Stack regardless of outcome. + set +e + XDIST_WORKER_FACTOR=1 uv run --all-extras nox -s test -- \ + -m "(scheduled or scheduled_only) and not monitors and not stress_only" \ + --junit-xml=reports/junit_sdk.xml + echo "exit_code=$?" >> $GITHUB_OUTPUT + + - name: Test / scheduled / he-tme + id: test_he_tme env: - BETTERSTACK_HEARTBEAT_URL: "${{ inputs.platform_environment == 'staging' && secrets.BETTERSTACK_HEARTBEAT_URL_STAGING || secrets.BETTERSTACK_HEARTBEAT_URL_PRODUCTION }}" SENTRY_DSN: ${{ secrets.SENTRY_DSN }} shell: bash run: | + # set +e so a test failure does not abort the step — we capture the exit code + # manually and send it to Better Stack regardless of outcome. set +e - make test_scheduled - EXIT_CODE=$? + XDIST_WORKER_FACTOR=1 uv run --all-extras nox -s test -- \ + -m "(scheduled or scheduled_only) and monitors and not stress_only" \ + --junit-xml=reports/junit_he_tme.xml + echo "exit_code=$?" >> $GITHUB_OUTPUT + + - name: Collect combined exit code and publish summary + id: collect + shell: bash + run: | + # Default to 1 (failure) if a step never wrote its output — e.g. a setup step + # errored before tests ran. 
Prevents sending a false "healthy" heartbeat. + SDK_EXIT=${{ steps.test_sdk.outputs.exit_code || '1' }} + HE_TME_EXIT=${{ steps.test_he_tme.outputs.exit_code || '1' }} + # Combined exit code: non-zero if either run failed + if [ "$SDK_EXIT" != "0" ] || [ "$HE_TME_EXIT" != "0" ]; then + COMBINED_EXIT=1 + else + COMBINED_EXIT=0 + fi + echo "sdk_exit=${SDK_EXIT}" >> $GITHUB_OUTPUT + echo "he_tme_exit=${HE_TME_EXIT}" >> $GITHUB_OUTPUT + echo "combined_exit=${COMBINED_EXIT}" >> $GITHUB_OUTPUT + # Show test execution in GitHub Job summary found_files=0 - for file in reports/pytest_*.md; do + for file in reports/pytest_*.md reports/pytest.md; do if [ -f "$file" ]; then cat "$file" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY @@ -121,53 +168,116 @@ jobs: echo "" >> $GITHUB_STEP_SUMMARY fi - # Send heartbeat to Sentry, defining the schedule on the fly - SENTRY_EXIT_CODE=$(sentry-cli monitors run -e CI --schedule "0 * * * *" --check-in-margin 30 --max-runtime 1 scheduled-testing-${{ inputs.platform_environment }}-hourly --timezone "Europe/Berlin" -- sh -c "exit $EXIT_CODE") - - # Provide heartbeat to BetterStack for monitoring/alerting if heartbeat url is configured as secret - if [ -n "$BETTERSTACK_HEARTBEAT_URL" ]; then - BETTERSTACK_METADATA_PAYLOAD=$(jq -n \ - --arg github_workflow "${{ github.workflow }}" \ - --arg github_run_url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ - --arg github_run_id "${{ github.run_id }}" \ - --arg github_job "${{ github.job }}" \ - --arg github_sha "${{ github.sha }}" \ - --arg github_actor "${{ github.actor }}" \ - --arg github_repository "${{ github.repository }}" \ - --arg github_ref "${{ github.ref }}" \ - --arg job_status "${{ job.status }}" \ - --arg github_event_name "${{ github.event_name }}" \ - --arg timestamp "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" \ - '{ - github: { - workflow: $github_workflow, - run_url: $github_run_url, - run_id: $github_run_id, - job: $github_job, - sha: 
$github_sha, - actor: $github_actor, - repository: $github_repository, - ref: $github_ref, - event_name: $github_event_name - }, - job: { - status: $job_status, - }, - timestamp: $timestamp, - }' - ) - curl \ - --fail-with-body \ - --silent \ - --request POST \ - --header "Content-Type: application/json" \ - --data-binary "${BETTERSTACK_METADATA_PAYLOAD}" \ - "${BETTERSTACK_HEARTBEAT_URL}/${EXIT_CODE}" - echo "INFO: Sent heartbeat to betterstack with exit code '${EXIT_CODE}'" - else - echo "WARNING: No BetterStack heartbeat URL configured, skipped heartbeat notification." + - name: Heartbeat / Sentry + if: always() + shell: bash + env: + SENTRY_DSN: ${{ secrets.SENTRY_DSN }} + COMBINED_EXIT: ${{ steps.collect.outputs.combined_exit }} + run: | + sentry-cli monitors run -e CI --schedule "0 * * * *" --check-in-margin 30 --max-runtime 1 \ + scheduled-testing-${{ inputs.platform_environment }}-hourly \ + --timezone "Europe/Berlin" -- sh -c "exit ${COMBINED_EXIT}" || true + + - name: Heartbeat / BetterStack / SDK + if: always() + shell: bash + env: + BETTERSTACK_HEARTBEAT_URL: "${{ inputs.platform_environment == 'staging' && secrets.BETTERSTACK_HEARTBEAT_URL_STAGING || secrets.BETTERSTACK_HEARTBEAT_URL_PRODUCTION }}" + SDK_EXIT: ${{ steps.collect.outputs.sdk_exit }} + run: | + if [ -z "$BETTERSTACK_HEARTBEAT_URL" ]; then + echo "WARNING: No BetterStack SDK heartbeat URL configured, skipped." 
+ exit 0 + fi + BETTERSTACK_METADATA_PAYLOAD=$(jq -n \ + --arg github_workflow "${{ github.workflow }}" \ + --arg github_run_url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ + --arg github_run_id "${{ github.run_id }}" \ + --arg github_job "${{ github.job }}" \ + --arg github_sha "${{ github.sha }}" \ + --arg github_actor "${{ github.actor }}" \ + --arg github_repository "${{ github.repository }}" \ + --arg github_ref "${{ github.ref }}" \ + --arg job_status "${{ job.status }}" \ + --arg github_event_name "${{ github.event_name }}" \ + --arg timestamp "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" \ + '{ + github: { + workflow: $github_workflow, + run_url: $github_run_url, + run_id: $github_run_id, + job: $github_job, + sha: $github_sha, + actor: $github_actor, + repository: $github_repository, + ref: $github_ref, + event_name: $github_event_name + }, + job: { + status: $job_status, + }, + timestamp: $timestamp, + }' + ) + curl \ + --fail-with-body \ + --silent \ + --request POST \ + --header "Content-Type: application/json" \ + --data-binary "${BETTERSTACK_METADATA_PAYLOAD}" \ + "${BETTERSTACK_HEARTBEAT_URL}/${SDK_EXIT}" + echo "INFO: Sent SDK heartbeat to BetterStack with exit code '${SDK_EXIT}'" + + - name: Heartbeat / BetterStack / HE-TME + if: always() + shell: bash + env: + BETTERSTACK_HEARTBEAT_URL_HE_TME: "${{ inputs.platform_environment == 'staging' && secrets.BETTERSTACK_HEARTBEAT_URL_HE_TME_STAGING || secrets.BETTERSTACK_HEARTBEAT_URL_HE_TME_PRODUCTION }}" + HE_TME_EXIT: ${{ steps.collect.outputs.he_tme_exit }} + run: | + if [ -z "$BETTERSTACK_HEARTBEAT_URL_HE_TME" ]; then + echo "INFO: No BetterStack HE-TME heartbeat URL configured, skipped." 
+ exit 0 fi - exit $EXIT_CODE + BETTERSTACK_METADATA_PAYLOAD=$(jq -n \ + --arg github_workflow "${{ github.workflow }}" \ + --arg github_run_url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ + --arg github_run_id "${{ github.run_id }}" \ + --arg github_job "${{ github.job }}" \ + --arg github_sha "${{ github.sha }}" \ + --arg github_actor "${{ github.actor }}" \ + --arg github_repository "${{ github.repository }}" \ + --arg github_ref "${{ github.ref }}" \ + --arg job_status "${{ job.status }}" \ + --arg github_event_name "${{ github.event_name }}" \ + --arg timestamp "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" \ + '{ + github: { + workflow: $github_workflow, + run_url: $github_run_url, + run_id: $github_run_id, + job: $github_job, + sha: $github_sha, + actor: $github_actor, + repository: $github_repository, + ref: $github_ref, + event_name: $github_event_name + }, + job: { + status: $job_status, + }, + timestamp: $timestamp, + }' + ) + curl \ + --fail-with-body \ + --silent \ + --request POST \ + --header "Content-Type: application/json" \ + --data-binary "${BETTERSTACK_METADATA_PAYLOAD}" \ + "${BETTERSTACK_HEARTBEAT_URL_HE_TME}/${HE_TME_EXIT}" + echo "INFO: Sent HE-TME heartbeat to BetterStack with exit code '${HE_TME_EXIT}'" - name: Upload test results uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -175,9 +285,14 @@ jobs: with: name: test-results-scheduled path: | - reports/junit.xml + reports/junit_sdk.xml + reports/junit_he_tme.xml reports/coverage.xml reports/coverage.md reports/coverage_html aignostics.log retention-days: 7 + + - name: Fail job if any tests failed + shell: bash + run: exit ${{ steps.collect.outputs.combined_exit }} diff --git a/.github/workflows/scheduled-testing-production-hourly.yml b/.github/workflows/scheduled-testing-production-hourly.yml index 4dd960dee..b991c4923 100644 --- a/.github/workflows/scheduled-testing-production-hourly.yml +++ 
b/.github/workflows/scheduled-testing-production-hourly.yml @@ -24,8 +24,10 @@ jobs: AIGNOSTICS_REFRESH_TOKEN_STAGING: ${{ secrets.AIGNOSTICS_REFRESH_TOKEN_STAGING }} GCP_CREDENTIALS_STAGING: ${{ secrets.GCP_CREDENTIALS_STAGING }} BETTERSTACK_HEARTBEAT_URL_STAGING: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_STAGING }} + BETTERSTACK_HEARTBEAT_URL_HE_TME_STAGING: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_HE_TME_STAGING }} AIGNOSTICS_CLIENT_ID_DEVICE_PRODUCTION: ${{ secrets.AIGNOSTICS_CLIENT_ID_DEVICE_PRODUCTION }} AIGNOSTICS_REFRESH_TOKEN_PRODUCTION: ${{ secrets.AIGNOSTICS_REFRESH_TOKEN_PRODUCTION }} GCP_CREDENTIALS_PRODUCTION: ${{ secrets.GCP_CREDENTIALS_PRODUCTION }} BETTERSTACK_HEARTBEAT_URL_PRODUCTION: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_PRODUCTION }} + BETTERSTACK_HEARTBEAT_URL_HE_TME_PRODUCTION: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_HE_TME_PRODUCTION }} SENTRY_DSN: ${{ secrets.SENTRY_DSN }} # For metrics and heartbeat diff --git a/.github/workflows/scheduled-testing-staging-hourly.yml b/.github/workflows/scheduled-testing-staging-hourly.yml index 878b8106e..7f8286210 100644 --- a/.github/workflows/scheduled-testing-staging-hourly.yml +++ b/.github/workflows/scheduled-testing-staging-hourly.yml @@ -24,8 +24,10 @@ jobs: AIGNOSTICS_REFRESH_TOKEN_STAGING: ${{ secrets.AIGNOSTICS_REFRESH_TOKEN_STAGING }} GCP_CREDENTIALS_STAGING: ${{ secrets.GCP_CREDENTIALS_STAGING }} BETTERSTACK_HEARTBEAT_URL_STAGING: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_STAGING }} + BETTERSTACK_HEARTBEAT_URL_HE_TME_STAGING: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_HE_TME_STAGING }} AIGNOSTICS_CLIENT_ID_DEVICE_PRODUCTION: ${{ secrets.AIGNOSTICS_CLIENT_ID_DEVICE_PRODUCTION }} AIGNOSTICS_REFRESH_TOKEN_PRODUCTION: ${{ secrets.AIGNOSTICS_REFRESH_TOKEN_PRODUCTION }} GCP_CREDENTIALS_PRODUCTION: ${{ secrets.GCP_CREDENTIALS_PRODUCTION }} BETTERSTACK_HEARTBEAT_URL_PRODUCTION: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_PRODUCTION }} + BETTERSTACK_HEARTBEAT_URL_HE_TME_PRODUCTION: ${{ 
secrets.BETTERSTACK_HEARTBEAT_URL_HE_TME_PRODUCTION }} SENTRY_DSN: ${{ secrets.SENTRY_DSN }} # For metrics and heartbeat diff --git a/pyproject.toml b/pyproject.toml index e456a9c70..62c181245 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -383,6 +383,7 @@ markers = [ "unit: Solitary unit tests - test a layer of a module in isolation with all dependencies mocked, except interaction with shared utils and the systems module. Unit tests must be able to pass offline, i.e. not calls to external services. The timeout should not be bigger than the default 10s, and must be <5 min.", "integration: Sociable integration tests - test interactions across architectural layers (e.g. CLI/GUI→Service, Service→Utils) or between modules (e.g. Application→Platform), using real SDK collaborators, real file I/O, real subprocesses, and real Docker containers. Integration test must be able to pass offline, i.e. mock external services (Aignostics Platform API, Auth0, S3/GCS buckets, IDC). The timeout should not be bigger than the default 10s, and must be <5 min.", "e2e: End-to-end tests - test complete workflows with real external network services (Aignostics Platform API, cloud storage, IDC, etc). If the test timeout is >= 5 min and < 60 min, additionally mark as `long_running`, if >= 60min mark as 'very_long_running'.", + "monitors: Tag a scheduled test with the application it monitors, e.g. @pytest.mark.monitors('he-tme'). Tests without this marker are considered SDK-layer health checks. 
Used to route Better Stack heartbeats to the correct monitor.", ] md_report = true md_report_output = "reports/pytest.md" diff --git a/tests/aignostics/platform/e2e_test.py b/tests/aignostics/platform/e2e_test.py index 634e85da0..fb0716eb6 100644 --- a/tests/aignostics/platform/e2e_test.py +++ b/tests/aignostics/platform/e2e_test.py @@ -529,6 +529,7 @@ def test_platform_test_app_submit_and_wait(record_property) -> None: @pytest.mark.e2e @pytest.mark.long_running @pytest.mark.scheduled_only +@pytest.mark.monitors("he-tme") @pytest.mark.timeout(timeout=HETA_APPLICATION_SUBMIT_AND_WAIT_TIMEOUT_SECONDS + 60 * 5) def test_platform_heta_app_submit_and_wait(record_property) -> None: """Test application runs with the HETA application. @@ -598,6 +599,7 @@ def test_platform_test_app_find_and_validate() -> None: @pytest.mark.e2e @pytest.mark.scheduled_only +@pytest.mark.monitors("he-tme") @pytest.mark.timeout(timeout=HETA_APPLICATION_SUBMIT_AND_FIND_SUBMIT_TIMEOUT_SECONDS) def test_platform_heta_app_submit() -> None: """Test application runs with the HETA application. @@ -709,6 +711,7 @@ def test_platform_special_app_find_and_validate() -> None: @pytest.mark.e2e @pytest.mark.long_running @pytest.mark.scheduled_only +@pytest.mark.monitors("he-tme") @pytest.mark.timeout(timeout=HETA_APPLICATION_FIND_AND_VALIDATE_TIMEOUT_SECONDS) def test_platform_heta_app_find_and_validate() -> None: """Test application runs with the HETA application. From d7d10bc84cbb8172a8b0753008887c3db53e9314 Mon Sep 17 00:00:00 2001 From: Ari Angelo Date: Tue, 12 May 2026 10:17:39 +0200 Subject: [PATCH 2/3] test(e2e): add monitors("test-app") tag to test-app scheduled tests Extends the monitors marker to the three test-app e2e tests so they are excluded from the SDK health-check run. The hourly workflow selects on the bare `monitors` marker (the marker argument is not part of the -m expression), so these tests run in, and report through, the HE-TME heartbeat step; a dedicated test-app Better Stack monitor is not wired up by this change.
--- tests/aignostics/platform/e2e_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/aignostics/platform/e2e_test.py b/tests/aignostics/platform/e2e_test.py index fb0716eb6..3b182d0ac 100644 --- a/tests/aignostics/platform/e2e_test.py +++ b/tests/aignostics/platform/e2e_test.py @@ -500,6 +500,7 @@ def _find_and_validate( @pytest.mark.e2e @pytest.mark.very_long_running @pytest.mark.scheduled_only +@pytest.mark.monitors("test-app") @pytest.mark.timeout(timeout=TEST_APPLICATION_SUBMIT_AND_WAIT_TIMEOUT_SECONDS + 60 * 5) def test_platform_test_app_submit_and_wait(record_property) -> None: """Test application runs with the test application. @@ -558,6 +559,7 @@ def test_platform_heta_app_submit_and_wait(record_property) -> None: @pytest.mark.skip(reason="Using submit and wait approach") @pytest.mark.e2e +@pytest.mark.monitors("test-app") @pytest.mark.timeout(timeout=TEST_APPLICATION_SUBMIT_AND_FIND_SUBMIT_TIMEOUT_SECONDS) def test_platform_test_app_submit() -> None: """Test application submission with the test application. @@ -581,6 +583,7 @@ def test_platform_test_app_submit() -> None: @pytest.mark.e2e @pytest.mark.scheduled_only +@pytest.mark.monitors("test-app") @pytest.mark.timeout(timeout=TEST_APPLICATION_FIND_AND_VALIDATE_TIMEOUT_SECONDS) def test_platform_test_app_find_and_validate() -> None: """Test application runs with the test application. 
From 0683245e33a749e5c85919f241c79db766b710a4 Mon Sep 17 00:00:00 2001 From: Ari Angelo Date: Tue, 12 May 2026 10:45:35 +0200 Subject: [PATCH 3/3] =?UTF-8?q?chore(tests):=20three-way=20Better=20Stack?= =?UTF-8?q?=20split=20=E2=80=94=20SDK=20/=20Platform=20API=20/=20applicati?= =?UTF-8?q?ons?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests are now routed to three separate Better Stack heartbeat monitors: - SDK (no monitors marker): token management, service wiring - Platform API (monitors_platform_api): auth, listing, connectivity - test_cli_health_json (system) - test_cli_application_list_verbose (application) - test_cli_run_list_limit_10 (application) - HE-TME / applications (monitors): application processing - existing he-tme and test-app tests (unchanged) Adds monitors_platform_api pytest marker alongside the existing monitors("platform-api") string for pytest -m expression filtering. Adds BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_* secrets (optional, heartbeat step skips gracefully when absent). --- .github/workflows/_scheduled-test-hourly.yml | 91 +++++++++++++++++-- .../scheduled-testing-production-hourly.yml | 2 + .../scheduled-testing-staging-hourly.yml | 2 + pyproject.toml | 1 + tests/aignostics/application/cli_test.py | 4 + tests/aignostics/system/cli_test.py | 2 + 6 files changed, 94 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_scheduled-test-hourly.yml b/.github/workflows/_scheduled-test-hourly.yml index dc0be9f2f..69b627529 100644 --- a/.github/workflows/_scheduled-test-hourly.yml +++ b/.github/workflows/_scheduled-test-hourly.yml @@ -25,6 +25,9 @@ on: BETTERSTACK_HEARTBEAT_URL_HE_TME_STAGING: # Optional until the HE-TME Better Stack monitor is created; the heartbeat step skips gracefully when absent. required: false + BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_STAGING: + # Optional until the Platform API Better Stack monitor is created; the heartbeat step skips gracefully when absent. 
+ required: false AIGNOSTICS_CLIENT_ID_DEVICE_PRODUCTION: required: true AIGNOSTICS_REFRESH_TOKEN_PRODUCTION: @@ -36,6 +39,9 @@ on: BETTERSTACK_HEARTBEAT_URL_HE_TME_PRODUCTION: # Optional until the HE-TME Better Stack monitor is created; the heartbeat step skips gracefully when absent. required: false + BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_PRODUCTION: + # Optional until the Platform API Better Stack monitor is created; the heartbeat step skips gracefully when absent. + required: false SENTRY_DSN: required: true @@ -96,10 +102,12 @@ jobs: echo "$GCP_CREDENTIALS" | base64 -d > credentials.json echo "GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/credentials.json" >> $GITHUB_ENV - # Tests are split into two runs so failures can be routed to the correct Better Stack monitor: - # SDK-layer failures (auth, listing, connectivity) go to the SDK monitor; HE-TME application - # failures go to the HE-TME monitor. Tests declare which system they monitor via - # @pytest.mark.monitors("he-tme"); tests without that marker are SDK health checks. + # Tests are split into three runs so failures can be routed to the correct Better Stack monitor: + # - SDK-layer failures (token management, service wiring): no monitors marker → SDK monitor + # - Platform API failures (auth, listing, connectivity): monitors_platform_api → Platform API monitor + # - Application failures (HE-TME, test-app processing): monitors("he-tme"/"test-app") → HE-TME monitor + # Tests declare which system they monitor via @pytest.mark.monitors("he-tme") / + # @pytest.mark.monitors_platform_api; tests with neither marker are SDK health checks. - name: Test / scheduled / sdk id: test_sdk env: @@ -110,10 +118,24 @@ jobs: # manually and send it to Better Stack regardless of outcome. 
set +e XDIST_WORKER_FACTOR=1 uv run --all-extras nox -s test -- \ - -m "(scheduled or scheduled_only) and not monitors and not stress_only" \ + -m "(scheduled or scheduled_only) and not monitors and not monitors_platform_api and not stress_only" \ --junit-xml=reports/junit_sdk.xml echo "exit_code=$?" >> $GITHUB_OUTPUT + - name: Test / scheduled / platform-api + id: test_platform_api + env: + SENTRY_DSN: ${{ secrets.SENTRY_DSN }} + shell: bash + run: | + # set +e so a test failure does not abort the step — we capture the exit code + # manually and send it to Better Stack regardless of outcome. + set +e + XDIST_WORKER_FACTOR=1 uv run --all-extras nox -s test -- \ + -m "(scheduled or scheduled_only) and monitors_platform_api and not stress_only" \ + --junit-xml=reports/junit_platform_api.xml + echo "exit_code=$?" >> $GITHUB_OUTPUT + - name: Test / scheduled / he-tme id: test_he_tme env: @@ -124,7 +146,7 @@ jobs: # manually and send it to Better Stack regardless of outcome. set +e XDIST_WORKER_FACTOR=1 uv run --all-extras nox -s test -- \ - -m "(scheduled or scheduled_only) and monitors and not stress_only" \ + -m "(scheduled or scheduled_only) and monitors and not monitors_platform_api and not stress_only" \ --junit-xml=reports/junit_he_tme.xml echo "exit_code=$?" >> $GITHUB_OUTPUT @@ -135,14 +157,16 @@ jobs: # Default to 1 (failure) if a step never wrote its output — e.g. a setup step # errored before tests ran. Prevents sending a false "healthy" heartbeat. 
SDK_EXIT=${{ steps.test_sdk.outputs.exit_code || '1' }} + PLATFORM_API_EXIT=${{ steps.test_platform_api.outputs.exit_code || '1' }} HE_TME_EXIT=${{ steps.test_he_tme.outputs.exit_code || '1' }} - # Combined exit code: non-zero if either run failed - if [ "$SDK_EXIT" != "0" ] || [ "$HE_TME_EXIT" != "0" ]; then + # Combined exit code: non-zero if any run failed + if [ "$SDK_EXIT" != "0" ] || [ "$PLATFORM_API_EXIT" != "0" ] || [ "$HE_TME_EXIT" != "0" ]; then COMBINED_EXIT=1 else COMBINED_EXIT=0 fi echo "sdk_exit=${SDK_EXIT}" >> $GITHUB_OUTPUT + echo "platform_api_exit=${PLATFORM_API_EXIT}" >> $GITHUB_OUTPUT echo "he_tme_exit=${HE_TME_EXIT}" >> $GITHUB_OUTPUT echo "combined_exit=${COMBINED_EXIT}" >> $GITHUB_OUTPUT @@ -229,6 +253,56 @@ jobs: "${BETTERSTACK_HEARTBEAT_URL}/${SDK_EXIT}" echo "INFO: Sent SDK heartbeat to BetterStack with exit code '${SDK_EXIT}'" + - name: Heartbeat / BetterStack / Platform API + if: always() + shell: bash + env: + BETTERSTACK_HEARTBEAT_URL_PLATFORM_API: "${{ inputs.platform_environment == 'staging' && secrets.BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_STAGING || secrets.BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_PRODUCTION }}" + PLATFORM_API_EXIT: ${{ steps.collect.outputs.platform_api_exit }} + run: | + if [ -z "$BETTERSTACK_HEARTBEAT_URL_PLATFORM_API" ]; then + echo "INFO: No BetterStack Platform API heartbeat URL configured, skipped." 
+ exit 0 + fi + BETTERSTACK_METADATA_PAYLOAD=$(jq -n \ + --arg github_workflow "${{ github.workflow }}" \ + --arg github_run_url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \ + --arg github_run_id "${{ github.run_id }}" \ + --arg github_job "${{ github.job }}" \ + --arg github_sha "${{ github.sha }}" \ + --arg github_actor "${{ github.actor }}" \ + --arg github_repository "${{ github.repository }}" \ + --arg github_ref "${{ github.ref }}" \ + --arg job_status "${{ job.status }}" \ + --arg github_event_name "${{ github.event_name }}" \ + --arg timestamp "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" \ + '{ + github: { + workflow: $github_workflow, + run_url: $github_run_url, + run_id: $github_run_id, + job: $github_job, + sha: $github_sha, + actor: $github_actor, + repository: $github_repository, + ref: $github_ref, + event_name: $github_event_name + }, + job: { + status: $job_status, + }, + timestamp: $timestamp, + }' + ) + curl \ + --fail-with-body \ + --silent \ + --request POST \ + --header "Content-Type: application/json" \ + --data-binary "${BETTERSTACK_METADATA_PAYLOAD}" \ + "${BETTERSTACK_HEARTBEAT_URL_PLATFORM_API}/${PLATFORM_API_EXIT}" + echo "INFO: Sent Platform API heartbeat to BetterStack with exit code '${PLATFORM_API_EXIT}'" + - name: Heartbeat / BetterStack / HE-TME if: always() shell: bash @@ -286,6 +360,7 @@ jobs: name: test-results-scheduled path: | reports/junit_sdk.xml + reports/junit_platform_api.xml reports/junit_he_tme.xml reports/coverage.xml reports/coverage.md diff --git a/.github/workflows/scheduled-testing-production-hourly.yml b/.github/workflows/scheduled-testing-production-hourly.yml index b991c4923..1147f7aaf 100644 --- a/.github/workflows/scheduled-testing-production-hourly.yml +++ b/.github/workflows/scheduled-testing-production-hourly.yml @@ -25,9 +25,11 @@ jobs: GCP_CREDENTIALS_STAGING: ${{ secrets.GCP_CREDENTIALS_STAGING }} BETTERSTACK_HEARTBEAT_URL_STAGING: ${{ 
secrets.BETTERSTACK_HEARTBEAT_URL_STAGING }} BETTERSTACK_HEARTBEAT_URL_HE_TME_STAGING: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_HE_TME_STAGING }} + BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_STAGING: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_STAGING }} AIGNOSTICS_CLIENT_ID_DEVICE_PRODUCTION: ${{ secrets.AIGNOSTICS_CLIENT_ID_DEVICE_PRODUCTION }} AIGNOSTICS_REFRESH_TOKEN_PRODUCTION: ${{ secrets.AIGNOSTICS_REFRESH_TOKEN_PRODUCTION }} GCP_CREDENTIALS_PRODUCTION: ${{ secrets.GCP_CREDENTIALS_PRODUCTION }} BETTERSTACK_HEARTBEAT_URL_PRODUCTION: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_PRODUCTION }} BETTERSTACK_HEARTBEAT_URL_HE_TME_PRODUCTION: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_HE_TME_PRODUCTION }} + BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_PRODUCTION: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_PRODUCTION }} SENTRY_DSN: ${{ secrets.SENTRY_DSN }} # For metrics and heartbeat diff --git a/.github/workflows/scheduled-testing-staging-hourly.yml b/.github/workflows/scheduled-testing-staging-hourly.yml index 7f8286210..767663787 100644 --- a/.github/workflows/scheduled-testing-staging-hourly.yml +++ b/.github/workflows/scheduled-testing-staging-hourly.yml @@ -25,9 +25,11 @@ jobs: GCP_CREDENTIALS_STAGING: ${{ secrets.GCP_CREDENTIALS_STAGING }} BETTERSTACK_HEARTBEAT_URL_STAGING: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_STAGING }} BETTERSTACK_HEARTBEAT_URL_HE_TME_STAGING: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_HE_TME_STAGING }} + BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_STAGING: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_STAGING }} AIGNOSTICS_CLIENT_ID_DEVICE_PRODUCTION: ${{ secrets.AIGNOSTICS_CLIENT_ID_DEVICE_PRODUCTION }} AIGNOSTICS_REFRESH_TOKEN_PRODUCTION: ${{ secrets.AIGNOSTICS_REFRESH_TOKEN_PRODUCTION }} GCP_CREDENTIALS_PRODUCTION: ${{ secrets.GCP_CREDENTIALS_PRODUCTION }} BETTERSTACK_HEARTBEAT_URL_PRODUCTION: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_PRODUCTION }} BETTERSTACK_HEARTBEAT_URL_HE_TME_PRODUCTION: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_HE_TME_PRODUCTION }} + 
BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_PRODUCTION: ${{ secrets.BETTERSTACK_HEARTBEAT_URL_PLATFORM_API_PRODUCTION }} SENTRY_DSN: ${{ secrets.SENTRY_DSN }} # For metrics and heartbeat diff --git a/pyproject.toml b/pyproject.toml index 62c181245..0efb14d22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -384,6 +384,7 @@ markers = [ "integration: Sociable integration tests - test interactions across architectural layers (e.g. CLI/GUI→Service, Service→Utils) or between modules (e.g. Application→Platform), using real SDK collaborators, real file I/O, real subprocesses, and real Docker containers. Integration test must be able to pass offline, i.e. mock external services (Aignostics Platform API, Auth0, S3/GCS buckets, IDC). The timeout should not be bigger than the default 10s, and must be <5 min.", "e2e: End-to-end tests - test complete workflows with real external network services (Aignostics Platform API, cloud storage, IDC, etc). If the test timeout is >= 5 min and < 60 min, additionally mark as `long_running`, if >= 60min mark as 'very_long_running'.", "monitors: Tag a scheduled test with the application it monitors, e.g. @pytest.mark.monitors('he-tme'). Tests without this marker are considered SDK-layer health checks. Used to route Better Stack heartbeats to the correct monitor.", + "monitors_platform_api: Tag a scheduled test that monitors the Platform API layer (auth, listing, connectivity). 
Used to route Platform API Better Stack heartbeats separately from SDK and application monitors.", ] md_report = true md_report_output = "reports/pytest.md" diff --git a/tests/aignostics/application/cli_test.py b/tests/aignostics/application/cli_test.py index c5d4a2e90..5ad953d89 100644 --- a/tests/aignostics/application/cli_test.py +++ b/tests/aignostics/application/cli_test.py @@ -164,6 +164,8 @@ def test_cli_application_list_non_verbose(runner: CliRunner, record_property) -> @pytest.mark.e2e @pytest.mark.scheduled +@pytest.mark.monitors("platform-api") +@pytest.mark.monitors_platform_api @pytest.mark.timeout(timeout=60) def test_cli_application_list_verbose(runner: CliRunner, record_property) -> None: """Check application list command runs successfully.""" @@ -801,6 +803,8 @@ def test_cli_run_submit_and_describe_and_cancel_and_download_and_delete( # noqa @pytest.mark.e2e @pytest.mark.scheduled +@pytest.mark.monitors("platform-api") +@pytest.mark.monitors_platform_api @pytest.mark.timeout(timeout=60) def test_cli_run_list_limit_10(runner: CliRunner, record_property) -> None: """Check run list command runs successfully.""" diff --git a/tests/aignostics/system/cli_test.py b/tests/aignostics/system/cli_test.py index f3254b499..f71f0f470 100644 --- a/tests/aignostics/system/cli_test.py +++ b/tests/aignostics/system/cli_test.py @@ -43,6 +43,8 @@ def test_cli_health_yaml_format(mock_service: MagicMock, runner: CliRunner, reco @pytest.mark.e2e @pytest.mark.scheduled +@pytest.mark.monitors("platform-api") +@pytest.mark.monitors_platform_api @pytest.mark.timeout(timeout=60) def test_cli_health_json(runner: CliRunner) -> None: """Check health CLI returns valid JSON with a valid status value."""