From 951d100dd8d922a6efaac7373bc16561f17dbdf8 Mon Sep 17 00:00:00 2001
From: "federico.spatola" 
Date: Wed, 6 May 2026 09:26:57 +0200
Subject: [PATCH 1/2] add papermill-based notebook tests for pyiceberg examples

---
 Makefile                                      |   5 +-
 pyproject.toml                                |   4 +
 tests/notebooks/test_pyiceberg_example.py     |  84 ++++++++++
 .../test_spark_integration_example.py         | 153 ++++++++++++++++++
 uv.lock                                       |  37 ++++-
 5 files changed, 281 insertions(+), 2 deletions(-)
 create mode 100644 tests/notebooks/test_pyiceberg_example.py
 create mode 100644 tests/notebooks/test_spark_integration_example.py

diff --git a/Makefile b/Makefile
index d262de45a9..4fe761192c 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,7 @@
 # under the License.
 .PHONY: help install install-uv check-license lint \
 	test test-integration test-integration-setup test-integration-exec test-integration-cleanup test-integration-rebuild \
-	test-s3 test-adls test-gcs test-coverage coverage-report \
+	test-s3 test-adls test-gcs test-coverage coverage-report test-notebook \
 	docs-serve docs-build notebook notebook-infra \
 	clean
@@ -150,6 +150,9 @@ coverage-report: ## Combine and report coverage
 	uv run $(PYTHON_ARG) coverage html
 	uv run $(PYTHON_ARG) coverage xml
 
+test-notebook: ## Run notebook tests (pyiceberg_example and spark_integration_example) via papermill
+	$(TEST_RUNNER) pytest tests/notebooks/test_pyiceberg_example.py tests/notebooks/test_spark_integration_example.py -m notebook $(PYTEST_ARGS)
+
 # ================
 # Documentation
 # ================
diff --git a/pyproject.toml b/pyproject.toml
index ac1177db44..96118f8451 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -122,6 +122,9 @@ dev = [
     "google-cloud-bigquery>=3.33.0,<4",
     "pyarrow-stubs>=20.0.0.20251107", # Remove when pyarrow >= 23.0.0 https://github.com/apache/arrow/pull/47609
     "sqlalchemy>=2.0.18,<3",
+    "papermill>=2.6.0",
+    "nbformat>=5.10.0",
+    "ipykernel>=6.29.0",
 ]
 # for mkdocs
 docs = [
@@ -161,6 +164,7 @@ markers = [
     "integration: marks integration tests against Apache Spark",
     "gcs: marks a test as requiring access to gcs compliant storage (use with --gs.token, --gs.project, and --gs.endpoint)",
     "benchmark: collection of tests to validate read/write performance before and after a change",
+    "notebook: marks tests that execute Jupyter notebooks via papermill",
 ]
 # Turns a warning into an error
diff --git a/tests/notebooks/test_pyiceberg_example.py b/tests/notebooks/test_pyiceberg_example.py
new file mode 100644
index 0000000000..ab73fc72d4
--- /dev/null
+++ b/tests/notebooks/test_pyiceberg_example.py
@@ -0,0 +1,84 @@
from pathlib import Path

import nbformat
import papermill as pm
import pytest

pytestmark = pytest.mark.notebook

NOTEBOOK_PATH = Path(__file__).parents[2] / "notebooks" / "pyiceberg_example.ipynb"


def get_all_stdout(nb: nbformat.NotebookNode) -> str:
    """Concatenate all stdout streams from every executed cell."""
    return "".join(
        out.get("text", "")
        for cell in nb.cells
        for out in cell.get("outputs", [])
        if out.get("output_type") == "stream" and out.get("name") == "stdout"
    )


@pytest.fixture(scope="session")
def pyiceberg_nb(tmp_path_factory: pytest.TempPathFactory) -> nbformat.NotebookNode:
    out = tmp_path_factory.mktemp("nb_out") / "pyiceberg_example_out.ipynb"
    return pm.execute_notebook(str(NOTEBOOK_PATH), str(out), kernel_name="python3")


class TestSmoke:
    def test_notebook_completes_without_error(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
        """papermill raises PapermillExecutionError if any cell fails."""
        assert 
pyiceberg_nb is not None + + def test_all_code_cells_executed(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + for cell in pyiceberg_nb.cells: + if cell.cell_type == "code": + assert cell.get("execution_count") is not None, f"Cell not executed:\n{cell.source[:80]}" + + +class TestCellOutputs: + def test_pyiceberg_version_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "PyIceberg version:" in get_all_stdout(pyiceberg_nb) + + def test_warehouse_location_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + stdout = get_all_stdout(pyiceberg_nb) + assert "Warehouse location:" in stdout + assert "iceberg_warehouse_" in stdout + + def test_catalog_loaded_successfully(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "Catalog loaded successfully!" in get_all_stdout(pyiceberg_nb) + + def test_namespace_default_created(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "default" in get_all_stdout(pyiceberg_nb) + + def test_rows_written_is_five(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "Rows written: 5" in get_all_stdout(pyiceberg_nb) + + def test_schema_evolved_message(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "Schema evolved!" in get_all_stdout(pyiceberg_nb) + + def test_tip_per_mile_column_present_after_evolution(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "tip_per_mile" in get_all_stdout(pyiceberg_nb) + + def test_filter_result_is_positive(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + """The notebook prints 'Rows with tip_per_mile > 1.0: N' — N must be > 0.""" + stdout = get_all_stdout(pyiceberg_nb) + assert "Rows with tip_per_mile > 1.0:" in stdout + for line in stdout.splitlines(): + if "Rows with tip_per_mile > 1.0:" in line: + count = int(line.split(":")[-1].strip()) + assert count > 0 + break + + def test_snapshot_id_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "Current snapshot ID:" in get_all_stdout(pyiceberg_nb) + + def test_table_history_has_entries(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + stdout = get_all_stdout(pyiceberg_nb) + assert "Table history:" in stdout + assert "Snapshot:" in stdout + + def test_warehouse_contains_parquet_and_metadata_files(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + stdout = get_all_stdout(pyiceberg_nb) + assert ".parquet" in stdout + assert ".metadata.json" in stdout diff --git a/tests/notebooks/test_spark_integration_example.py b/tests/notebooks/test_spark_integration_example.py new file mode 100644 index 0000000000..2388e55662 --- /dev/null +++ b/tests/notebooks/test_spark_integration_example.py @@ -0,0 +1,153 @@ +import textwrap +from pathlib import Path + +import nbformat +import papermill as pm +import pytest + +pytestmark = pytest.mark.notebook + +NOTEBOOK_PATH = Path(__file__).parents[2] / "notebooks" / "spark_integration_example.ipynb" + +# --------------------------------------------------------------------------- +# Mock pyspark +# Replaces pyspark.sql.SparkSession with a fake one +# --------------------------------------------------------------------------- +_MOCK_PYSPARK = textwrap.dedent("""\ + import sys + import types + from unittest.mock import MagicMock + + def _make_fake_pyspark(): + pyspark_mod = types.ModuleType("pyspark") + sql_mod = types.ModuleType("pyspark.sql") + pyspark_mod.sql = sql_mod + sys.modules.setdefault("pyspark", pyspark_mod) + sys.modules.setdefault("pyspark.sql", sql_mod) + return pyspark_mod, sql_mod + + _pyspark, _sql = _make_fake_pyspark() + + 
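    # The sys.modules stubs above only win if the notebook has not imported
    # pyspark yet; papermill executes this injected cell first, so the
    # notebook's own imports resolve to the fake modules.

    # Canned outputs for the fake session's sql()/show() calls. They only
    # approximate Spark's ASCII table rendering; the assertions in
    # test_spark_integration_example.py match substrings of these strings,
    # not real Spark output.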
_SHOW_CATALOGS = ( + "+-------------+\\n" + "|catalogName |\\n" + "+-------------+\\n" + "|spark_catalog|\\n" + "|local |\\n" + "+-------------+\\n" + ) + _SHOW_NAMESPACES = ( + "+---------+\\n" + "|namespace|\\n" + "+---------+\\n" + "|default |\\n" + "+---------+\\n" + ) + _SHOW_TABLES = ( + "+---------+-----------+-----------+\\n" + "|namespace|tableName |isTemporary|\\n" + "+---------+-----------+-----------+\\n" + "|default |test_all |false |\\n" + "+---------+-----------+-----------+\\n" + ) + _DESCRIBE_TABLE = ( + "+--------------------+---------+-------+\\n" + "|col_name |data_type|comment|\\n" + "+--------------------+---------+-------+\\n" + "|boolean_col |boolean |null |\\n" + "|integer_col |integer |null |\\n" + "+--------------------+---------+-------+\\n" + ) + _SQL_RESPONSES = { + "SHOW CATALOGS": _SHOW_CATALOGS, + "SHOW NAMESPACES": _SHOW_NAMESPACES, + "SHOW TABLES FROM default": _SHOW_TABLES, + "DESCRIBE TABLE default.test_all_types": _DESCRIBE_TABLE, + } + + def _make_df(output): + df = MagicMock() + df.show.side_effect = lambda *a, **kw: print(output, end="") + return df + + class _FakeBuilder: + def remote(self, url): return self + def getOrCreate(self): return _FakeSession() + + class _FakeSession: + builder = _FakeBuilder() + def sql(self, query): + key = query.strip().rstrip(";") + output = _SQL_RESPONSES.get(key, "+------+\\n| col |\\n+------+\\n| val |\\n+------+\\n") + return _make_df(output) + + _FakeSparkSession = MagicMock(spec=object) + _FakeSparkSession.builder = _FakeBuilder() + _sql.SparkSession = _FakeSparkSession +""") + + +def get_all_stdout(nb: nbformat.NotebookNode) -> str: + """Concatenate all stdout streams from every executed cell.""" + return "".join( + out.get("text", "") + for cell in nb.cells + for out in cell.get("outputs", []) + if out.get("output_type") == "stream" and out.get("name") == "stdout" + ) + + +def _inject_mock_and_execute(notebook_path: Path, output_path: Path) -> nbformat.NotebookNode: + """ + Load the real notebook, prepend the mock-pyspark setup cell, write to a + temporary copy and execute it with papermill. 
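    The patched copy is written into the same temporary directory as the
    executed output, so the notebook checked into the repository is never
    modified.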
+ """ + nb = nbformat.read(str(notebook_path), as_version=4) + + mock_cell = nbformat.v4.new_code_cell(_MOCK_PYSPARK) + mock_cell.metadata["tags"] = ["injected-mock"] + nb.cells.insert(0, mock_cell) + + patched_path = output_path.parent / "spark_patched.ipynb" + nbformat.write(nb, str(patched_path)) + + return pm.execute_notebook(str(patched_path), str(output_path), kernel_name="python3") + + +@pytest.fixture(scope="session") +def spark_nb(tmp_path_factory: pytest.TempPathFactory) -> nbformat.NotebookNode: + out = tmp_path_factory.mktemp("nb_out") / "spark_integration_example_out.ipynb" + return _inject_mock_and_execute(NOTEBOOK_PATH, out) + + +class TestSmoke: + def test_notebook_completes_without_error(self, spark_nb: nbformat.NotebookNode) -> None: + assert spark_nb is not None + + def test_all_code_cells_executed(self, spark_nb: nbformat.NotebookNode) -> None: + for cell in spark_nb.cells: + if cell.cell_type == "code": + assert cell.get("execution_count") is not None, f"Cell not executed:\n{cell.source[:80]}" + + +class TestCellOutputs: + def test_show_catalogs_lists_spark_catalog_and_local(self, spark_nb: nbformat.NotebookNode) -> None: + stdout = get_all_stdout(spark_nb) + assert "spark_catalog" in stdout + assert "local" in stdout + + def test_show_namespaces_contains_default(self, spark_nb: nbformat.NotebookNode) -> None: + assert "default" in get_all_stdout(spark_nb) + + def test_show_tables_produces_tabular_output(self, spark_nb: nbformat.NotebookNode) -> None: + assert "+---------+-----------+-----------+" in get_all_stdout(spark_nb) + + def test_describe_table_lists_column_names(self, spark_nb: nbformat.NotebookNode) -> None: + assert "col_name" in get_all_stdout(spark_nb) + + def test_describe_table_lists_data_types(self, spark_nb: nbformat.NotebookNode) -> None: + stdout = get_all_stdout(spark_nb) + assert "boolean" in stdout or "integer" in stdout + + def test_show_tables_includes_test_table_row(self, spark_nb: nbformat.NotebookNode) -> None: + assert "test_all" in get_all_stdout(spark_nb) diff --git a/uv.lock b/uv.lock index 5a3c46dc44..4c746ac988 100644 --- a/uv.lock +++ b/uv.lock @@ -1372,12 +1372,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7a/e3/9d34173ec068631faea3ea6e73050700729363e7e33306a9a3218e5cdc61/duckdb-1.5.2-cp314-cp314-win_arm64.whl", hash = "sha256:c9f3e0b71b8a50fccfb42794899285d9d318ce2503782b9dd54868e5ecd0ad31", size = 14402513, upload-time = "2026-04-13T11:30:06.609Z" }, ] +[[package]] +name = "entrypoints" +version = "0.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ea/8d/a7121ffe5f402dc015277d2d31eb82d2187334503a011c18f2e78ecbb9b2/entrypoints-0.4.tar.gz", hash = "sha256:b706eddaa9218a19ebcd67b56818f05bb27589b1ca9e8d797b74affad4ccacd4", size = 13974, upload-time = "2022-02-02T21:30:28.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/a8/365059bbcd4572cbc41de17fd5b682be5868b218c3c5479071865cab9078/entrypoints-0.4-py3-none-any.whl", hash = "sha256:f174b5ff827504fd3cd97cc3f8649f3693f51538c7e4bdf3ef002c8429d42f9f", size = 5294, upload-time = "2022-02-02T21:30:26.024Z" }, +] + [[package]] name = "exceptiongroup" version = "1.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } wheels = [ @@ -3896,6 +3905,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/af/4fbc8cab944db5d21b7e2a5b8e9211a03a79852b1157e2c102fcc61ac440/pandocfilters-1.5.1-py2.py3-none-any.whl", hash = "sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc", size = 8663, upload-time = "2024-01-18T20:08:11.28Z" }, ] +[[package]] +name = "papermill" +version = "2.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp", marker = "python_full_version == '3.12.*'" }, + { name = "click" }, + { name = "entrypoints" }, + { name = "nbclient" }, + { name = "nbformat" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tenacity" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f8/b6/92d770c5ced66ed0134256f8de781e98c824d3a0662af1643a91fcc36663/papermill-2.7.0.tar.gz", hash = "sha256:ec10b37594a060662f57269e1ebd108c209d204450f00fdfeb70a1c7cfb7fbc8", size = 77961, upload-time = "2026-02-27T19:07:30.548Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/9f/f9fd57a727dcc89c54e84455d8317bff7db05ef21bb6d05b03705111f7c0/papermill-2.7.0-py3-none-any.whl", hash = "sha256:e1855e6670100a02bb4f8a6870484a5c10b84a8d2e49c49921c90209940c7514", size = 88858, upload-time = "2026-02-27T19:07:28.862Z" }, +] + [[package]] name = "parso" version = "0.8.5" @@ -4681,9 +4710,12 @@ dev = [ { name = "docutils" }, { name = "fastavro" }, { name = "google-cloud-bigquery" }, + { name = "ipykernel" }, { name = "moto", extra = ["server"] }, { name = "mypy-boto3-dynamodb" }, { name = "mypy-boto3-glue" }, + { name = "nbformat" }, + { name = "papermill" }, { name = "prek" }, { name = "protobuf" }, { name = "pyarrow-stubs" }, @@ -4771,9 +4803,12 @@ dev = [ { name = "docutils", specifier = "!=0.21.post1" }, { name = "fastavro", specifier = "==1.12.2" }, { name = "google-cloud-bigquery", specifier = ">=3.33.0,<4" }, + { name = "ipykernel", specifier = ">=6.29.0" }, { name = "moto", extras = ["server"], specifier = ">=5.0.2,<6" }, { name = "mypy-boto3-dynamodb", specifier = ">=1.28.18" }, { name = "mypy-boto3-glue", specifier = ">=1.28.18" }, + { name = "nbformat", specifier = ">=5.10.0" }, + { name = "papermill", specifier = ">=2.6.0" }, { name = "prek", specifier = ">=0.2.1,<0.4" }, { name = "protobuf", specifier = "==6.33.5" }, { name = "pyarrow-stubs", specifier = ">=20.0.0.20251107" }, From a7d4b945653debbc07e714a73be52f0de9270631 Mon Sep 17 00:00:00 2001 From: "federico.spatola" Date: Wed, 6 May 2026 15:45:00 +0200 Subject: [PATCH 2/2] test: add Apache license header to the new notebook test files --- tests/notebooks/test_pyiceberg_example.py | 17 +++++++++++++++++ .../notebooks/test_spark_integration_example.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/tests/notebooks/test_pyiceberg_example.py b/tests/notebooks/test_pyiceberg_example.py index ab73fc72d4..eea5b49963 100644 --- a/tests/notebooks/test_pyiceberg_example.py +++ b/tests/notebooks/test_pyiceberg_example.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pathlib import Path import nbformat diff --git a/tests/notebooks/test_spark_integration_example.py b/tests/notebooks/test_spark_integration_example.py index 2388e55662..e242e43157 100644 --- a/tests/notebooks/test_spark_integration_example.py +++ b/tests/notebooks/test_spark_integration_example.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import textwrap from pathlib import Path