From 951d100dd8d922a6efaac7373bc16561f17dbdf8 Mon Sep 17 00:00:00 2001
From: "federico.spatola" 
Date: Wed, 6 May 2026 09:26:57 +0200
Subject: [PATCH 1/2] add papermill-based notebook tests for pyiceberg examples

---
 Makefile                                      |   5 +-
 pyproject.toml                                |   4 +
 tests/notebooks/test_pyiceberg_example.py     |  84 ++++++++++
 .../test_spark_integration_example.py         | 153 ++++++++++++++++++
 uv.lock                                       |  37 ++++-
 5 files changed, 281 insertions(+), 2 deletions(-)
 create mode 100644 tests/notebooks/test_pyiceberg_example.py
 create mode 100644 tests/notebooks/test_spark_integration_example.py

diff --git a/Makefile b/Makefile
index d262de45a9..4fe761192c 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,7 @@
 # under the License.
 .PHONY: help install install-uv check-license lint \
 	test test-integration test-integration-setup test-integration-exec test-integration-cleanup test-integration-rebuild \
-	test-s3 test-adls test-gcs test-coverage coverage-report \
+	test-s3 test-adls test-gcs test-coverage coverage-report test-notebook \
 	docs-serve docs-build notebook notebook-infra \
 	clean
@@ -150,6 +150,9 @@ coverage-report: ## Combine and report coverage
 	uv run $(PYTHON_ARG) coverage html
 	uv run $(PYTHON_ARG) coverage xml
 
+test-notebook: ## Run notebook tests (pyiceberg_example and spark_integration_example) via papermill
+	$(TEST_RUNNER) pytest tests/notebooks/test_pyiceberg_example.py tests/notebooks/test_spark_integration_example.py -m notebook $(PYTEST_ARGS)
+
 # ================
 # Documentation
 # ================
diff --git a/pyproject.toml b/pyproject.toml
index ac1177db44..96118f8451 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -122,6 +122,9 @@ dev = [
     "google-cloud-bigquery>=3.33.0,<4",
     "pyarrow-stubs>=20.0.0.20251107", # Remove when pyarrow >= 23.0.0 https://github.com/apache/arrow/pull/47609
     "sqlalchemy>=2.0.18,<3",
+    "papermill>=2.6.0",
+    "nbformat>=5.10.0",
+    "ipykernel>=6.29.0",
 ]
 # for mkdocs
 docs = [
@@ -161,6 +164,7 @@ markers = [
     "integration: marks integration tests against Apache Spark",
     "gcs: marks a test as requiring access to gcs compliant storage (use with --gs.token, --gs.project, and --gs.endpoint)",
     "benchmark: collection of tests to validate read/write performance before and after a change",
+    "notebook: marks tests that execute Jupyter notebooks via papermill",
 ]
 # Turns a warning into an error
diff --git a/tests/notebooks/test_pyiceberg_example.py b/tests/notebooks/test_pyiceberg_example.py
new file mode 100644
index 0000000000..ab73fc72d4
--- /dev/null
+++ b/tests/notebooks/test_pyiceberg_example.py
@@ -0,0 +1,84 @@
from pathlib import Path

import nbformat
import papermill as pm
import pytest

pytestmark = pytest.mark.notebook

NOTEBOOK_PATH = Path(__file__).parents[2] / "notebooks" / "pyiceberg_example.ipynb"


def get_all_stdout(nb: nbformat.NotebookNode) -> str:
    """Concatenate all stdout streams from every executed cell."""
    return "".join(
        out.get("text", "")
        for cell in nb.cells
        for out in cell.get("outputs", [])
        if out.get("output_type") == "stream" and out.get("name") == "stdout"
    )


@pytest.fixture(scope="session")
def pyiceberg_nb(tmp_path_factory: pytest.TempPathFactory) -> nbformat.NotebookNode:
    out = tmp_path_factory.mktemp("nb_out") / "pyiceberg_example_out.ipynb"
    return pm.execute_notebook(str(NOTEBOOK_PATH), str(out), kernel_name="python3")


class TestSmoke:
    def test_notebook_completes_without_error(self, pyiceberg_nb: nbformat.NotebookNode) -> None:
        """papermill raises PapermillExecutionError if any cell fails."""
        assert 
pyiceberg_nb is not None + + def test_all_code_cells_executed(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + for cell in pyiceberg_nb.cells: + if cell.cell_type == "code": + assert cell.get("execution_count") is not None, f"Cell not executed:\n{cell.source[:80]}" + + +class TestCellOutputs: + def test_pyiceberg_version_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "PyIceberg version:" in get_all_stdout(pyiceberg_nb) + + def test_warehouse_location_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + stdout = get_all_stdout(pyiceberg_nb) + assert "Warehouse location:" in stdout + assert "iceberg_warehouse_" in stdout + + def test_catalog_loaded_successfully(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "Catalog loaded successfully!" in get_all_stdout(pyiceberg_nb) + + def test_namespace_default_created(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "default" in get_all_stdout(pyiceberg_nb) + + def test_rows_written_is_five(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "Rows written: 5" in get_all_stdout(pyiceberg_nb) + + def test_schema_evolved_message(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "Schema evolved!" in get_all_stdout(pyiceberg_nb) + + def test_tip_per_mile_column_present_after_evolution(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "tip_per_mile" in get_all_stdout(pyiceberg_nb) + + def test_filter_result_is_positive(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + """The notebook prints 'Rows with tip_per_mile > 1.0: N' — N must be > 0.""" + stdout = get_all_stdout(pyiceberg_nb) + assert "Rows with tip_per_mile > 1.0:" in stdout + for line in stdout.splitlines(): + if "Rows with tip_per_mile > 1.0:" in line: + count = int(line.split(":")[-1].strip()) + assert count > 0 + break + + def test_snapshot_id_printed(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + assert "Current snapshot ID:" in get_all_stdout(pyiceberg_nb) + + def test_table_history_has_entries(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + stdout = get_all_stdout(pyiceberg_nb) + assert "Table history:" in stdout + assert "Snapshot:" in stdout + + def test_warehouse_contains_parquet_and_metadata_files(self, pyiceberg_nb: nbformat.NotebookNode) -> None: + stdout = get_all_stdout(pyiceberg_nb) + assert ".parquet" in stdout + assert ".metadata.json" in stdout diff --git a/tests/notebooks/test_spark_integration_example.py b/tests/notebooks/test_spark_integration_example.py new file mode 100644 index 0000000000..2388e55662 --- /dev/null +++ b/tests/notebooks/test_spark_integration_example.py @@ -0,0 +1,153 @@ +import textwrap +from pathlib import Path + +import nbformat +import papermill as pm +import pytest + +pytestmark = pytest.mark.notebook + +NOTEBOOK_PATH = Path(__file__).parents[2] / "notebooks" / "spark_integration_example.ipynb" + +# --------------------------------------------------------------------------- +# Mock pyspark +# Replaces pyspark.sql.SparkSession with a fake one +# --------------------------------------------------------------------------- +_MOCK_PYSPARK = textwrap.dedent("""\ + import sys + import types + from unittest.mock import MagicMock + + def _make_fake_pyspark(): + pyspark_mod = types.ModuleType("pyspark") + sql_mod = types.ModuleType("pyspark.sql") + pyspark_mod.sql = sql_mod + sys.modules.setdefault("pyspark", pyspark_mod) + sys.modules.setdefault("pyspark.sql", sql_mod) + return pyspark_mod, sql_mod + + _pyspark, _sql = _make_fake_pyspark() + + 
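    # The sys.modules stubs above only win if the notebook has not imported
    # pyspark yet; papermill executes this injected cell first, so the
    # notebook's own imports resolve to the fake modules.

    # Canned outputs for the fake session's sql()/show() calls. They only
    # approximate Spark's ASCII table rendering; the assertions in
    # test_spark_integration_example.py match substrings of these strings,
    # not real Spark output.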
_SHOW_CATALOGS = ( + "+-------------+\\n" + "|catalogName |\\n" + "+-------------+\\n" + "|spark_catalog|\\n" + "|local |\\n" + "+-------------+\\n" + ) + _SHOW_NAMESPACES = ( + "+---------+\\n" + "|namespace|\\n" + "+---------+\\n" + "|default |\\n" + "+---------+\\n" + ) + _SHOW_TABLES = ( + "+---------+-----------+-----------+\\n" + "|namespace|tableName |isTemporary|\\n" + "+---------+-----------+-----------+\\n" + "|default |test_all |false |\\n" + "+---------+-----------+-----------+\\n" + ) + _DESCRIBE_TABLE = ( + "+--------------------+---------+-------+\\n" + "|col_name |data_type|comment|\\n" + "+--------------------+---------+-------+\\n" + "|boolean_col |boolean |null |\\n" + "|integer_col |integer |null |\\n" + "+--------------------+---------+-------+\\n" + ) + _SQL_RESPONSES = { + "SHOW CATALOGS": _SHOW_CATALOGS, + "SHOW NAMESPACES": _SHOW_NAMESPACES, + "SHOW TABLES FROM default": _SHOW_TABLES, + "DESCRIBE TABLE default.test_all_types": _DESCRIBE_TABLE, + } + + def _make_df(output): + df = MagicMock() + df.show.side_effect = lambda *a, **kw: print(output, end="") + return df + + class _FakeBuilder: + def remote(self, url): return self + def getOrCreate(self): return _FakeSession() + + class _FakeSession: + builder = _FakeBuilder() + def sql(self, query): + key = query.strip().rstrip(";") + output = _SQL_RESPONSES.get(key, "+------+\\n| col |\\n+------+\\n| val |\\n+------+\\n") + return _make_df(output) + + _FakeSparkSession = MagicMock(spec=object) + _FakeSparkSession.builder = _FakeBuilder() + _sql.SparkSession = _FakeSparkSession +""") + + +def get_all_stdout(nb: nbformat.NotebookNode) -> str: + """Concatenate all stdout streams from every executed cell.""" + return "".join( + out.get("text", "") + for cell in nb.cells + for out in cell.get("outputs", []) + if out.get("output_type") == "stream" and out.get("name") == "stdout" + ) + + +def _inject_mock_and_execute(notebook_path: Path, output_path: Path) -> nbformat.NotebookNode: + """ + Load the real notebook, prepend the mock-pyspark setup cell, write to a + temporary copy and execute it with papermill. 
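    The patched copy is written into the same temporary directory as the
    executed output, so the notebook checked into the repository is never
    modified.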
+ """ + nb = nbformat.read(str(notebook_path), as_version=4) + + mock_cell = nbformat.v4.new_code_cell(_MOCK_PYSPARK) + mock_cell.metadata["tags"] = ["injected-mock"] + nb.cells.insert(0, mock_cell) + + patched_path = output_path.parent / "spark_patched.ipynb" + nbformat.write(nb, str(patched_path)) + + return pm.execute_notebook(str(patched_path), str(output_path), kernel_name="python3") + + +@pytest.fixture(scope="session") +def spark_nb(tmp_path_factory: pytest.TempPathFactory) -> nbformat.NotebookNode: + out = tmp_path_factory.mktemp("nb_out") / "spark_integration_example_out.ipynb" + return _inject_mock_and_execute(NOTEBOOK_PATH, out) + + +class TestSmoke: + def test_notebook_completes_without_error(self, spark_nb: nbformat.NotebookNode) -> None: + assert spark_nb is not None + + def test_all_code_cells_executed(self, spark_nb: nbformat.NotebookNode) -> None: + for cell in spark_nb.cells: + if cell.cell_type == "code": + assert cell.get("execution_count") is not None, f"Cell not executed:\n{cell.source[:80]}" + + +class TestCellOutputs: + def test_show_catalogs_lists_spark_catalog_and_local(self, spark_nb: nbformat.NotebookNode) -> None: + stdout = get_all_stdout(spark_nb) + assert "spark_catalog" in stdout + assert "local" in stdout + + def test_show_namespaces_contains_default(self, spark_nb: nbformat.NotebookNode) -> None: + assert "default" in get_all_stdout(spark_nb) + + def test_show_tables_produces_tabular_output(self, spark_nb: nbformat.NotebookNode) -> None: + assert "+---------+-----------+-----------+" in get_all_stdout(spark_nb) + + def test_describe_table_lists_column_names(self, spark_nb: nbformat.NotebookNode) -> None: + assert "col_name" in get_all_stdout(spark_nb) + + def test_describe_table_lists_data_types(self, spark_nb: nbformat.NotebookNode) -> None: + stdout = get_all_stdout(spark_nb) + assert "boolean" in stdout or "integer" in stdout + + def test_show_tables_includes_test_table_row(self, spark_nb: nbformat.NotebookNode) -> None: + assert "test_all" in get_all_stdout(spark_nb) diff --git a/uv.lock b/uv.lock index 5a3c46dc44..4c746ac988 100644 --- a/uv.lock +++ b/uv.lock @@ -1372,12 +1372,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7a/e3/9d34173ec068631faea3ea6e73050700729363e7e33306a9a3218e5cdc61/duckdb-1.5.2-cp314-cp314-win_arm64.whl", hash = "sha256:c9f3e0b71b8a50fccfb42794899285d9d318ce2503782b9dd54868e5ecd0ad31", size = 14402513, upload-time = "2026-04-13T11:30:06.609Z" }, ] +[[package]] +name = "entrypoints" +version = "0.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ea/8d/a7121ffe5f402dc015277d2d31eb82d2187334503a011c18f2e78ecbb9b2/entrypoints-0.4.tar.gz", hash = "sha256:b706eddaa9218a19ebcd67b56818f05bb27589b1ca9e8d797b74affad4ccacd4", size = 13974, upload-time = "2022-02-02T21:30:28.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/a8/365059bbcd4572cbc41de17fd5b682be5868b218c3c5479071865cab9078/entrypoints-0.4-py3-none-any.whl", hash = "sha256:f174b5ff827504fd3cd97cc3f8649f3693f51538c7e4bdf3ef002c8429d42f9f", size = 5294, upload-time = "2022-02-02T21:30:26.024Z" }, +] + [[package]] name = "exceptiongroup" version = "1.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } wheels = [ @@ -3896,6 +3905,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/af/4fbc8cab944db5d21b7e2a5b8e9211a03a79852b1157e2c102fcc61ac440/pandocfilters-1.5.1-py2.py3-none-any.whl", hash = "sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc", size = 8663, upload-time = "2024-01-18T20:08:11.28Z" }, ] +[[package]] +name = "papermill" +version = "2.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp", marker = "python_full_version == '3.12.*'" }, + { name = "click" }, + { name = "entrypoints" }, + { name = "nbclient" }, + { name = "nbformat" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tenacity" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f8/b6/92d770c5ced66ed0134256f8de781e98c824d3a0662af1643a91fcc36663/papermill-2.7.0.tar.gz", hash = "sha256:ec10b37594a060662f57269e1ebd108c209d204450f00fdfeb70a1c7cfb7fbc8", size = 77961, upload-time = "2026-02-27T19:07:30.548Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/9f/f9fd57a727dcc89c54e84455d8317bff7db05ef21bb6d05b03705111f7c0/papermill-2.7.0-py3-none-any.whl", hash = "sha256:e1855e6670100a02bb4f8a6870484a5c10b84a8d2e49c49921c90209940c7514", size = 88858, upload-time = "2026-02-27T19:07:28.862Z" }, +] + [[package]] name = "parso" version = "0.8.5" @@ -4681,9 +4710,12 @@ dev = [ { name = "docutils" }, { name = "fastavro" }, { name = "google-cloud-bigquery" }, + { name = "ipykernel" }, { name = "moto", extra = ["server"] }, { name = "mypy-boto3-dynamodb" }, { name = "mypy-boto3-glue" }, + { name = "nbformat" }, + { name = "papermill" }, { name = "prek" }, { name = "protobuf" }, { name = "pyarrow-stubs" }, @@ -4771,9 +4803,12 @@ dev = [ { name = "docutils", specifier = "!=0.21.post1" }, { name = "fastavro", specifier = "==1.12.2" }, { name = "google-cloud-bigquery", specifier = ">=3.33.0,<4" }, + { name = "ipykernel", specifier = ">=6.29.0" }, { name = "moto", extras = ["server"], specifier = ">=5.0.2,<6" }, { name = "mypy-boto3-dynamodb", specifier = ">=1.28.18" }, { name = "mypy-boto3-glue", specifier = ">=1.28.18" }, + { name = "nbformat", specifier = ">=5.10.0" }, + { name = "papermill", specifier = ">=2.6.0" }, { name = "prek", specifier = ">=0.2.1,<0.4" }, { name = "protobuf", specifier = "==6.33.5" }, { name = "pyarrow-stubs", specifier = ">=20.0.0.20251107" }, From a7d4b945653debbc07e714a73be52f0de9270631 Mon Sep 17 00:00:00 2001 From: "federico.spatola" Date: Wed, 6 May 2026 15:45:00 +0200 Subject: [PATCH 2/2] test: add Apache license header to the new notebook test files --- tests/notebooks/test_pyiceberg_example.py | 17 +++++++++++++++++ .../notebooks/test_spark_integration_example.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/tests/notebooks/test_pyiceberg_example.py b/tests/notebooks/test_pyiceberg_example.py index ab73fc72d4..eea5b49963 100644 --- a/tests/notebooks/test_pyiceberg_example.py +++ b/tests/notebooks/test_pyiceberg_example.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + from pathlib import Path import nbformat diff --git a/tests/notebooks/test_spark_integration_example.py b/tests/notebooks/test_spark_integration_example.py index 2388e55662..e242e43157 100644 --- a/tests/notebooks/test_spark_integration_example.py +++ b/tests/notebooks/test_spark_integration_example.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import textwrap from pathlib import Path