From a602f42f2b7c977246b1c36312d2bed74b0a7288 Mon Sep 17 00:00:00 2001 From: Tim Condello Date: Tue, 21 Apr 2026 20:26:07 -0400 Subject: [PATCH 1/3] feat(bulk-import): add client-side parquet validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `index.bulk_import.validate(uri)` and the top-level `pinecone.validate_bulk_import(uri)` helper so users can check parquet files for schema and data correctness before sending them to the server. - Reads only the parquet footer (schema) by default — no vector data downloaded even for large remote files - Optionally samples up to N rows to detect null IDs, non-finite values, metadata JSON errors, and the 40 KB metadata size limit - Supports single files and directories via the pyarrow filesystem abstraction (s3://, gs://, az:// URIs work automatically) - Returns BulkImportValidationResult with .is_valid, .errors, .warnings, .files_checked, .rows_sampled; the .uri field can be passed directly to index.bulk_import.start() - Verbose mode prints per-file OK/BAD lines and a final summary - pyarrow is an optional dependency: pip install 'pinecone[parquet]' - 40 unit tests covering schema validation, data sampling, and end-to-end file I/O via real parquet files on disk Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 69 +++ pinecone/__init__.py | 8 + pinecone/__init__.pyi | 2 + pinecone/db_data/dataclasses/__init__.py | 2 + .../bulk_import_validation_result.py | 42 ++ .../resources/asyncio/bulk_import_asyncio.py | 24 + .../db_data/resources/sync/bulk_import.py | 55 +++ .../resources/sync/bulk_import_validator.py | 409 +++++++++++++++ pyproject.toml | 3 + tests/unit/data/test_bulk_import_validator.py | 466 ++++++++++++++++++ uv.lock | 65 ++- 11 files changed, 1143 insertions(+), 2 deletions(-) create mode 100644 CLAUDE.md create mode 100644 pinecone/db_data/dataclasses/bulk_import_validation_result.py create mode 100644 pinecone/db_data/resources/sync/bulk_import_validator.py create mode 100644 tests/unit/data/test_bulk_import_validator.py diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..f376d906a --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,69 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Development Setup + +```sh +uv sync --extra grpc --extra asyncio # install all dependencies +uv run pre-commit install # enable lint/format checks on commit +``` + +## Key Commands + +```sh +make test-unit # run unit + grpc unit tests +uv run pytest tests/unit # REST unit tests only +uv run pytest tests/unit_grpc # gRPC unit tests only +uv run pytest tests/unit/path/to/test_file.py::ClassName::test_method # single test + +uv run mypy pinecone # type-check (excludes pinecone/core/) +uv run ruff check --fix # lint +uv run ruff format # format + +uv run repl # interactive REPL with pre-loaded Pinecone client + +make generate-oas # regenerate pinecone/core/openapi/ from OpenAPI specs +``` + +Integration tests make live Pinecone API calls and incur cost — only Pinecone employees should run them. Set credentials in `.env` (see `.env.example`) before running. 
+ +## Architecture + +### Layer Overview + +``` +Pinecone / PineconeAsyncio ← public entry point (pinecone/pinecone.py, pinecone_asyncio.py) + ├── DBControl ← index/collection/backup management (pinecone/db_control/) + ├── DBData / Index ← vector upsert/query/fetch/delete (pinecone/db_data/) + └── Inference ← embedding and reranking models (pinecone/inference/) +``` + +`Pinecone` and `PineconeAsyncio` are thin facades. Each delegates to `DBControl` (control-plane operations) and returns `Index` / `IndexAsyncio` objects (data-plane operations). Inference is accessible via `pc.inference`. + +### Generated Code — Never Edit Manually + +`pinecone/core/openapi/` is fully generated from OpenAPI specs via `make generate-oas` (which runs `codegen/build-oas.sh`). The script calls the openapi-generator Docker image, applies several post-processing fixes (underscore field name normalization, datetime coercion removal, shared-class deduplication), then runs `ruff format`. **Do not hand-edit files in `pinecone/core/`.** + +Shared OpenAPI utilities (ApiClient, exceptions, model_utils, etc.) live in `pinecone/openapi_support/` rather than being duplicated across the five generated modules (`db_control`, `db_data`, `inference`, `oauth`, `admin`). + +### Adapter Layer + +`pinecone/adapters/` converts generated OpenAPI response objects into clean SDK dataclasses. This isolates the rest of the SDK from generated-model churn. When a new response type is needed, add it here rather than parsing OpenAPI objects in index.py or other business logic files. + +### Sync / Async Split + +Every stateful class has a sync and an async variant: +- `DBControl` / `DBControlAsyncio` +- `Index` (in `db_data/index.py`) / `IndexAsyncio` (in `db_data/index_asyncio.py`) +- `Inference` / `AsyncioInference` + +The async variants use `aiohttp` (optional extra). The sync variants use `urllib3`. gRPC is a third transport option installed via the `grpc` extra; data-plane integration tests can be toggled to gRPC with `USE_GRPC=true`. + +### Lazy Imports + +`pinecone/__init__.py` defers most imports through `utils/lazy_imports.py` to keep module startup time fast. When adding new public symbols, register them in the lazy import maps in `__init__.py` rather than adding top-level imports. The `.pyi` stub (`__init__.pyi`) is the authoritative type-visible public API surface and must be kept in sync. + +### Testing Philosophy + +Unit tests are intentionally sparse — they cover data conversion edge cases (e.g. `VectorFactory`, `QueryResultsAggregator`) but not every method. Most confidence comes from integration tests. When writing unit tests, check `tests/unit/db_data/` for patterns. Fixtures and index setup/teardown for integration tests live in `conftest.py` files at each directory level. 
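### Example: Registering a New Public Symbol

A minimal sketch of the lazy-import registration described above (the entry shown here is the one added for bulk import validation; swap in your own symbol and module path):

```python
# In the lazy import map in pinecone/__init__.py, keys are public names and
# values are (module path, attribute name) tuples:
"BulkImportValidationResult": (
    "pinecone.db_data.dataclasses.bulk_import_validation_result",
    "BulkImportValidationResult",
),
```

Mirror the same symbol in `pinecone/__init__.pyi` with a regular import so type checkers and IDEs can see it:

```python
from pinecone.db_data.dataclasses.bulk_import_validation_result import BulkImportValidationResult
```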
diff --git a/pinecone/__init__.py b/pinecone/__init__.py index 1064610c2..39f814146 100644 --- a/pinecone/__init__.py +++ b/pinecone/__init__.py @@ -60,6 +60,14 @@ "UpdateRequest": ("pinecone.db_data.models", "UpdateRequest"), "NamespaceDescription": ("pinecone.core.openapi.db_data.models", "NamespaceDescription"), "ImportErrorMode": ("pinecone.db_data.resources.sync.bulk_import", "ImportErrorMode"), + "BulkImportValidationResult": ( + "pinecone.db_data.dataclasses.bulk_import_validation_result", + "BulkImportValidationResult", + ), + "validate_bulk_import": ( + "pinecone.db_data.resources.sync.bulk_import_validator", + "validate_bulk_import_uri", + ), "FilterBuilder": ("pinecone.db_data.filter_builder", "FilterBuilder"), "VectorDictionaryMissingKeysError": ( "pinecone.db_data.errors", diff --git a/pinecone/__init__.pyi b/pinecone/__init__.pyi index 45ca8caf3..10e3e2974 100644 --- a/pinecone/__init__.pyi +++ b/pinecone/__init__.pyi @@ -50,6 +50,8 @@ from pinecone.db_data.models import ( ) from pinecone.core.openapi.db_data.models import NamespaceDescription from pinecone.db_data.resources.sync.bulk_import import ImportErrorMode +from pinecone.db_data.dataclasses.bulk_import_validation_result import BulkImportValidationResult +from pinecone.db_data.resources.sync.bulk_import_validator import validate_bulk_import_uri as validate_bulk_import from pinecone.db_data.errors import ( VectorDictionaryMissingKeysError, VectorDictionaryExcessKeysError, diff --git a/pinecone/db_data/dataclasses/__init__.py b/pinecone/db_data/dataclasses/__init__.py index d6709e8ab..5eb677e6f 100644 --- a/pinecone/db_data/dataclasses/__init__.py +++ b/pinecone/db_data/dataclasses/__init__.py @@ -8,6 +8,7 @@ from .query_response import QueryResponse from .upsert_response import UpsertResponse from .update_response import UpdateResponse +from .bulk_import_validation_result import BulkImportValidationResult __all__ = [ "SparseValues", @@ -21,4 +22,5 @@ "QueryResponse", "UpsertResponse", "UpdateResponse", + "BulkImportValidationResult", ] diff --git a/pinecone/db_data/dataclasses/bulk_import_validation_result.py b/pinecone/db_data/dataclasses/bulk_import_validation_result.py new file mode 100644 index 000000000..7b4f669c8 --- /dev/null +++ b/pinecone/db_data/dataclasses/bulk_import_validation_result.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass +class BulkImportValidationResult: + """Result of a bulk import parquet validation check. + + Attributes: + is_valid: True if no errors were found. + uri: The URI that was validated. Pass directly to ``index.bulk_import.start()``. + errors: Blocking issues that would cause the import to fail. + warnings: Non-blocking observations (e.g. detected dimension). + files_checked: Number of parquet files whose schema was inspected. + rows_sampled: Number of data rows checked (0 if schema-only validation). 
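
    Example:
        A typical gating pattern before starting an import (the ``index``
        handle and the bucket path below are placeholders)::

            result = index.bulk_import.validate("s3://my-bucket/vectors/", sample_rows=0)
            if result.is_valid:
                index.bulk_import.start(uri=result.uri)
            else:
                for err in result.errors:
                    print(err)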
+ """ + + is_valid: bool + uri: str = "" + errors: list[str] = field(default_factory=list) + warnings: list[str] = field(default_factory=list) + files_checked: int = 0 + rows_sampled: int = 0 + + def __repr__(self) -> str: + status = "VALID" if self.is_valid else "INVALID" + lines = [f"BulkImportValidationResult({status})"] + if self.uri: + lines.append(f" uri={self.uri!r}") + if self.errors: + lines.append(f" errors ({len(self.errors)}):") + for e in self.errors: + lines.append(f" - {e}") + if self.warnings: + lines.append(f" warnings ({len(self.warnings)}):") + for w in self.warnings: + lines.append(f" - {w}") + lines.append( + f" files_checked={self.files_checked}, rows_sampled={self.rows_sampled}" + ) + return "\n".join(lines) diff --git a/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py b/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py index 3610e7fec..31e5f9f96 100644 --- a/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py +++ b/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py @@ -11,6 +11,10 @@ ) from ..sync.bulk_import_request_factory import BulkImportRequestFactory +from ..sync.bulk_import_validator import validate_bulk_import_uri +from pinecone.db_data.dataclasses.bulk_import_validation_result import ( + BulkImportValidationResult, +) for m in [StartImportResponse, ListImportsResponse, ImportModel]: install_json_repr_override(m) @@ -150,3 +154,23 @@ async def cancel(self, id: str): """ args = BulkImportRequestFactory.cancel_import_args(id=id) return await self.__import_operations_api.cancel_bulk_import(**args) + + def validate( + self, + uri: str, + dimension: int | None = None, + vector_type: Literal["dense", "sparse"] | None = None, + sample_rows: int = 100, + verbose: bool = False, + ) -> "BulkImportValidationResult": + """Validate parquet file(s) for Pinecone bulk import compatibility. + + This method is synchronous; pyarrow does not support async file I/O. + For schema-only validation (no data download) pass ``sample_rows=0``. + + See :meth:`pinecone.db_data.resources.sync.bulk_import.BulkImportResource.validate` + for full documentation. + """ + return validate_bulk_import_uri( + uri, dimension=dimension, vector_type=vector_type, sample_rows=sample_rows, verbose=verbose + ) diff --git a/pinecone/db_data/resources/sync/bulk_import.py b/pinecone/db_data/resources/sync/bulk_import.py index 440cc588c..8047fe9fe 100644 --- a/pinecone/db_data/resources/sync/bulk_import.py +++ b/pinecone/db_data/resources/sync/bulk_import.py @@ -11,6 +11,10 @@ ) from .bulk_import_request_factory import BulkImportRequestFactory, ImportErrorMode +from .bulk_import_validator import validate_bulk_import_uri +from pinecone.db_data.dataclasses.bulk_import_validation_result import ( + BulkImportValidationResult, +) for m in [StartImportResponse, ListImportsResponse, ImportModel]: install_json_repr_override(m) @@ -157,3 +161,54 @@ def cancel(self, id: str): """ args = BulkImportRequestFactory.cancel_import_args(id=id) return self.__import_operations_api.cancel_bulk_import(**args) + + def validate( + self, + uri: str, + dimension: int | None = None, + vector_type: Literal["dense", "sparse"] | None = None, + sample_rows: int = 100, + verbose: bool = False, + ) -> "BulkImportValidationResult": + """Validate parquet file(s) for Pinecone bulk import compatibility. + + Reads only the parquet file footer (schema metadata) by default, making + this fast even for large remote files. 
Pass ``sample_rows > 0`` (the + default) to also read a small number of rows and check for null IDs, + non-finite vector values, and metadata correctness. + + Requires ``pyarrow``. Install with ``pip install 'pinecone[parquet]'``. + Remote URIs (``s3://``, ``gs://``, ``az://``) work automatically when + the appropriate filesystem library is available in your environment + (``pyarrow`` includes built-in S3 support). + + Args: + uri: Local path or remote URI. May point to a single ``.parquet`` + file or a directory/prefix containing multiple files. + dimension: Expected vector dimension. A mismatch is reported as an + error. When omitted, dimension is inferred from the schema if + the file uses a ``fixed_size_list`` type. + vector_type: ``"dense"`` or ``"sparse"``. Inferred from column + names when omitted. + sample_rows: Rows to read for data-level checks. Set to ``0`` for + schema-only validation (no data download). + + Returns: + :class:`~pinecone.BulkImportValidationResult` + + Examples: + >>> result = index.bulk_import.validate("s3://my-bucket/vectors/") + >>> if not result.is_valid: + ... for error in result.errors: + ... print(error) + + >>> # Schema-only check — reads only the parquet footer + >>> result = index.bulk_import.validate( + ... "s3://my-bucket/vectors/", + ... dimension=1024, + ... sample_rows=0, + ... ) + """ + return validate_bulk_import_uri( + uri, dimension=dimension, vector_type=vector_type, sample_rows=sample_rows, verbose=verbose + ) diff --git a/pinecone/db_data/resources/sync/bulk_import_validator.py b/pinecone/db_data/resources/sync/bulk_import_validator.py new file mode 100644 index 000000000..07f054738 --- /dev/null +++ b/pinecone/db_data/resources/sync/bulk_import_validator.py @@ -0,0 +1,409 @@ +from __future__ import annotations + +import json +import math +from typing import TYPE_CHECKING + +from pinecone.db_data.dataclasses.bulk_import_validation_result import ( + BulkImportValidationResult, +) + +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.parquet as pq + +# Matches Pinecone's documented metadata size limit. +_MAX_METADATA_BYTES = 40 * 1024 + +# Scalar types Pinecone accepts as metadata values. +_VALID_METADATA_SCALAR_TYPES = (str, int, float, bool) + +# Columns that have special meaning in bulk import parquet files. +# Note: sparse indices/values are sub-fields of the 'sparse_values' struct, not top-level columns. +_KNOWN_COLUMNS = {"id", "values", "sparse_values", "metadata"} + + +def _require_pyarrow() -> None: + try: + import pyarrow # noqa: F401 + except ImportError: + raise ImportError( + "pyarrow is required for bulk import validation. 
" + "Install it with: pip install 'pinecone[parquet]'" + ) + + +# --------------------------------------------------------------------------- +# Type helpers +# --------------------------------------------------------------------------- + + +def _is_string_type(t) -> bool: + import pyarrow as pa + + return bool(pa.types.is_string(t) or pa.types.is_large_string(t)) + + +def _is_float_list_type(t) -> bool: + import pyarrow as pa + + if pa.types.is_list(t) or pa.types.is_large_list(t) or pa.types.is_fixed_size_list(t): + return bool(pa.types.is_floating(t.value_type)) + return False + + +def _is_integer_list_type(t) -> bool: + import pyarrow as pa + + if pa.types.is_list(t) or pa.types.is_large_list(t) or pa.types.is_fixed_size_list(t): + return bool(pa.types.is_integer(t.value_type)) + return False + + +def _fixed_list_size(t) -> int | None: + """Return the list size if the type is fixed_size_list, else None.""" + import pyarrow as pa + + if pa.types.is_fixed_size_list(t): + return int(t.list_size) + return None + + +def _is_sparse_struct_type(t) -> bool: + """Check if t matches STRUCT, values: LIST>. + + Confirmed format from real Pinecone parquet files: + sparse_values: struct, values: list> + """ + import pyarrow as pa + + if not pa.types.is_struct(t): + return False + field_names = {t.field(i).name for i in range(t.num_fields)} + if "indices" not in field_names or "values" not in field_names: + return False + return bool( + _is_integer_list_type(t.field("indices").type) + and _is_float_list_type(t.field("values").type) + ) + + +# --------------------------------------------------------------------------- +# Schema validation (reads only the parquet footer — no data download) +# --------------------------------------------------------------------------- + + +def _validate_schema( + schema: "pa.Schema", + dimension: int | None, + vector_type: str | None, + errors: list[str], + warnings: list[str], +) -> None: + field_names = set(schema.names) + + if "id" not in field_names: + errors.append("Missing required column 'id'") + else: + t = schema.field("id").type + if not _is_string_type(t): + errors.append(f"Column 'id' must be string type, got {t}") + + has_values = "values" in field_names + has_sparse = "sparse_values" in field_names + + is_dense = (vector_type == "dense") if vector_type else has_values + is_sparse = (vector_type == "sparse") if vector_type else has_sparse + + if not is_dense and not is_sparse: + errors.append( + "No vector columns detected. " + "Expected a 'values' column (dense) or a 'sparse_values' struct column (sparse)." 
+ ) + return + + if is_dense: + if "values" not in field_names: + errors.append("Missing required column 'values' for dense vectors") + else: + t = schema.field("values").type + if not _is_float_list_type(t): + errors.append(f"Column 'values' must be a list of floats, got {t}") + else: + schema_dim = _fixed_list_size(t) + if schema_dim is not None and dimension is not None and schema_dim != dimension: + errors.append( + f"Vector dimension in schema ({schema_dim}) does not match " + f"expected dimension ({dimension})" + ) + elif schema_dim is not None and dimension is None: + warnings.append(f"Detected vector dimension from schema: {schema_dim}") + + if is_sparse: + if "sparse_values" not in field_names: + errors.append("Missing required column 'sparse_values' for sparse vectors") + else: + t = schema.field("sparse_values").type + if not _is_sparse_struct_type(t): + errors.append( + f"Column 'sparse_values' must be " + f"STRUCT, values: LIST>, got {t}" + ) + + if "metadata" in field_names: + t = schema.field("metadata").type + if not _is_string_type(t): + errors.append( + f"Column 'metadata' must be a JSON-encoded UTF-8 string, got {t}. " + "See https://docs.pinecone.io/guides/index-data/import-data" + ) + + extra = field_names - _KNOWN_COLUMNS + if extra: + errors.append( + f"Unexpected column(s) {sorted(extra)} — no additional columns are permitted. " + "Only 'id', 'values', 'sparse_values', 'sparse_indices', and 'metadata' are allowed." + ) + + +# --------------------------------------------------------------------------- +# Data validation (reads a small sample of rows) +# --------------------------------------------------------------------------- + + +def _is_valid_metadata_value(v) -> bool: + if isinstance(v, _VALID_METADATA_SCALAR_TYPES): + return True + if isinstance(v, list): + return all(isinstance(x, str) for x in v) + return False + + +def _validate_data_sample( + table: "pa.Table", + dimension: int | None, + errors: list[str], + warnings: list[str], +) -> None: + import pyarrow as pa + + if "id" in table.schema.names: + id_col = table.column("id") + if id_col.null_count > 0: + errors.append(f"Found {id_col.null_count} null ID(s)") + empty = sum(1 for v in id_col if v.is_valid and v.as_py() == "") + if empty: + errors.append(f"Found {empty} empty string ID(s)") + + if "values" in table.schema.names: + values_col = table.column("values") + if values_col.null_count > 0: + errors.append(f"Found {values_col.null_count} null vector(s) in 'values'") + + for i, val in enumerate(values_col): + if not val.is_valid: + continue + arr = val.as_py() + if arr is None: + continue + if dimension is not None and len(arr) != dimension: + errors.append( + f"Row {i}: vector length {len(arr)} != expected dimension {dimension}" + ) + break + non_finite = [x for x in arr if x is None or not math.isfinite(x)] + if non_finite: + errors.append(f"Row {i}: 'values' contains non-finite value(s) (NaN or Inf)") + break + + if "metadata" in table.schema.names: + meta_col = table.column("metadata") + # Only validate JSON-string metadata; struct columns are validated by the schema check. 
+ if pa.types.is_string(meta_col.type) or pa.types.is_large_string(meta_col.type): + for i, val in enumerate(meta_col): + if not val.is_valid: + continue + raw = val.as_py() + if raw is None: + continue + size = len(raw.encode("utf-8")) + if size > _MAX_METADATA_BYTES: + errors.append( + f"Row {i}: metadata size {size} bytes exceeds the 40 KB limit" + ) + try: + obj = json.loads(raw) + except json.JSONDecodeError as e: + errors.append(f"Row {i}: metadata is not valid JSON: {e}") + continue + if not isinstance(obj, dict): + errors.append( + f"Row {i}: metadata must be a JSON object, got {type(obj).__name__}" + ) + continue + valid_fields = {k: v for k, v in obj.items() if _is_valid_metadata_value(v)} + if obj and not valid_fields: + warnings.append( + f"Row {i}: metadata has no Pinecone-compatible fields " + "(values must be string, number, bool, or list of strings)" + ) + + +# --------------------------------------------------------------------------- +# File listing +# --------------------------------------------------------------------------- + + +def _list_parquet_files(uri: str) -> list[str]: + """Return all parquet file URIs under a path (handles single file or directory).""" + import pyarrow.fs as pafs + + if uri.lower().endswith(".parquet"): + return [uri] + + fs, root_path = pafs.FileSystem.from_uri(uri) + scheme = (uri.split("://")[0] + "://") if "://" in uri else "" + + root_path = root_path.rstrip("/") + selector = pafs.FileSelector(root_path, recursive=True) + file_infos = fs.get_file_info(selector) + + result = [] + for fi in file_infos: + if fi.type == pafs.FileType.File and fi.base_name.lower().endswith(".parquet"): + result.append(f"{scheme}{fi.path}" if scheme else fi.path) + + return sorted(result) + + +# --------------------------------------------------------------------------- +# Public entry point +# --------------------------------------------------------------------------- + + +def validate_bulk_import_uri( + uri: str, + dimension: int | None = None, + vector_type: str | None = None, + sample_rows: int = 100, + verbose: bool = False, +) -> BulkImportValidationResult: + """Validate parquet file(s) at *uri* for Pinecone bulk import compatibility. + + Schema validation reads only the parquet file footer — no vector data is + downloaded — making it cheap even for large remote files. When + ``sample_rows > 0`` a small number of rows are also read to check for null + IDs, non-finite values, and metadata correctness. + + Args: + uri: Local path or remote URI (``s3://``, ``gs://``, ``az://``). + May point to a single ``.parquet`` file or a directory/prefix + containing multiple files. + dimension: Expected vector dimension. When provided, any mismatch + between the file and this value is reported as an error. + vector_type: ``"dense"`` or ``"sparse"``. Inferred from the schema + when omitted. + sample_rows: Number of rows to read for data-level checks. Set to + ``0`` to perform schema-only validation without reading any data. + verbose: When ``True``, print per-file progress and a summary to stdout. + + Returns: + :class:`BulkImportValidationResult` — pass ``result.uri`` directly to + ``index.bulk_import.start()`` if ``result.is_valid`` is ``True``. 
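
    Examples:
        This function is also exposed at the package top level as
        ``pinecone.validate_bulk_import``. The path, bucket, and dimension
        below are placeholders:

        >>> from pinecone import validate_bulk_import
        >>> result = validate_bulk_import("./export/vectors.parquet", dimension=1536)
        >>> if not result.is_valid:
        ...     for error in result.errors:
        ...         print(error)

        >>> # Schema-only check of a remote prefix (reads only parquet footers)
        >>> result = validate_bulk_import("s3://my-bucket/vectors/", sample_rows=0)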
+ """ + _require_pyarrow() + import pyarrow.parquet as pq + + errors: list[str] = [] + warnings: list[str] = [] + files_checked = 0 + rows_sampled = 0 + + try: + parquet_files = _list_parquet_files(uri) + except Exception as e: + return BulkImportValidationResult( + is_valid=False, + uri=uri, + errors=[f"Failed to access '{uri}': {e}"], + ) + + if not parquet_files: + return BulkImportValidationResult( + is_valid=False, + uri=uri, + errors=[f"No parquet files found at '{uri}'"], + ) + + total = len(parquet_files) + multi = total > 1 + + if verbose: + print(f"Validating {total} file(s) at {uri} ...") + + ok_count = 0 + bad_count = 0 + + for file_uri in parquet_files: + file_errors: list[str] = [] + file_warnings: list[str] = [] + index = files_checked + 1 + + try: + schema = pq.read_schema(file_uri) + except Exception as e: + msg = f"failed to read parquet schema: {e}" + errors.append(f"{file_uri}: {msg}") + if verbose: + print(f"[{index:>{len(str(total))}}/{total}] BAD {file_uri}") + print(f" {msg}") + files_checked += 1 + bad_count += 1 + continue + + _validate_schema(schema, dimension, vector_type, file_errors, file_warnings) + + if sample_rows > 0 and not file_errors: + try: + columns = [c for c in _KNOWN_COLUMNS if c in schema.names] + pf = pq.ParquetFile(file_uri) + sample_table = None + for batch in pf.iter_batches(batch_size=sample_rows, columns=columns): + import pyarrow as pa + + sample_table = pa.Table.from_batches([batch]) + break + if sample_table is not None: + _validate_data_sample(sample_table, dimension, file_errors, file_warnings) + rows_sampled += len(sample_table) + except Exception as e: + file_warnings.append(f"Could not read sample data: {e}") + + if verbose: + status = "BAD " if file_errors else "OK " + print(f"[{index:>{len(str(total))}}/{total}] {status} {file_uri}") + for fe in file_errors: + print(f" error: {fe}") + for fw in file_warnings: + print(f" warning: {fw}") + + prefix = f"{file_uri}: " if multi else "" + errors.extend(f"{prefix}{e}" for e in file_errors) + warnings.extend(f"{prefix}{w}" for w in file_warnings) + files_checked += 1 + if file_errors: + bad_count += 1 + else: + ok_count += 1 + + if verbose: + print(f"\nTotal: {total} OK: {ok_count} BAD: {bad_count} rows sampled: {rows_sampled}") + + return BulkImportValidationResult( + is_valid=len(errors) == 0, + uri=uri, + errors=errors, + warnings=warnings, + files_checked=files_checked, + rows_sampled=rows_sampled, + ) diff --git a/pyproject.toml b/pyproject.toml index f5e06b3ec..06d3efae4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,9 @@ asyncio = [ "aiohttp>=3.9.0", "aiohttp-retry>=2.9.1,<3.0.0", ] +parquet = [ + "pyarrow>=14.0.0", +] types = [ "mypy>=1.6.1,<2.0.0", "types-urllib3>=1.26.25.14,<1.27.0.0", diff --git a/tests/unit/data/test_bulk_import_validator.py b/tests/unit/data/test_bulk_import_validator.py new file mode 100644 index 000000000..0634e15dd --- /dev/null +++ b/tests/unit/data/test_bulk_import_validator.py @@ -0,0 +1,466 @@ +"""Tests for bulk import parquet validation. + +Validation logic is adapted from the internal notebook +``bulk_import_parquet_validate.ipynb``. 
+""" + +import json +import math +import pytest + +pytest.importorskip("pyarrow", reason="pyarrow required for bulk import validation") + +import pyarrow as pa +import pyarrow.parquet as pq + +from pinecone.db_data.resources.sync.bulk_import_validator import ( + validate_bulk_import_uri, + _validate_schema, + _validate_data_sample, +) +from pinecone.db_data.dataclasses.bulk_import_validation_result import ( + BulkImportValidationResult, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def make_schema(fields: dict) -> pa.Schema: + """Build a pyarrow Schema from a {name: type} dict.""" + return pa.schema([pa.field(name, dtype) for name, dtype in fields.items()]) + + +def make_dense_schema(dimension: int = 4, float_type=pa.float32()) -> pa.Schema: + return make_schema( + { + "id": pa.string(), + "values": pa.list_(float_type), + } + ) + + +def make_fixed_dense_schema(dimension: int = 4) -> pa.Schema: + return make_schema( + { + "id": pa.string(), + "values": pa.list_(pa.field("item", pa.float32()), dimension), + } + ) + + +def make_sparse_struct_type() -> pa.StructType: + """The exact struct type Pinecone uses for sparse vectors in parquet.""" + return pa.struct([ + pa.field("indices", pa.list_(pa.uint32())), + pa.field("values", pa.list_(pa.float32())), + ]) + + +def make_sparse_schema() -> pa.Schema: + return make_schema( + { + "id": pa.string(), + "sparse_values": make_sparse_struct_type(), + } + ) + + +def make_table(rows: list[dict], schema: pa.Schema) -> pa.Table: + arrays = {} + for field in schema: + arrays[field.name] = pa.array( + [r.get(field.name) for r in rows], type=field.type + ) + return pa.table(arrays, schema=schema) + + +def make_dense_table( + n: int = 3, + dimension: int = 4, + float_type=pa.float32(), + bad_id: bool = False, + null_id: bool = False, + null_vector: bool = False, + non_finite: bool = False, +) -> pa.Table: + schema = make_schema({"id": pa.string(), "values": pa.list_(float_type)}) + ids = [None if (null_id and i == 0) else ("" if (bad_id and i == 0) else f"vec-{i}") for i in range(n)] + vectors = [] + for i in range(n): + if null_vector and i == 0: + vectors.append(None) + elif non_finite and i == 0: + vectors.append([float("inf")] + [float(j) for j in range(dimension - 1)]) + else: + vectors.append([float(j) for j in range(dimension)]) + return pa.table( + {"id": pa.array(ids, pa.string()), "values": pa.array(vectors, pa.list_(float_type))}, + ) + + +# --------------------------------------------------------------------------- +# Schema validation tests +# --------------------------------------------------------------------------- + + +class TestValidateSchema: + def test_valid_dense_schema(self): + errors, warnings = [], [] + _validate_schema(make_dense_schema(), None, None, errors, warnings) + assert errors == [] + + def test_missing_id(self): + schema = make_schema({"values": pa.list_(pa.float32())}) + errors, warnings = [], [] + _validate_schema(schema, None, None, errors, warnings) + assert any("'id'" in e for e in errors) + + def test_id_wrong_type(self): + schema = make_schema({"id": pa.int64(), "values": pa.list_(pa.float32())}) + errors, warnings = [], [] + _validate_schema(schema, None, None, errors, warnings) + assert any("'id'" in e and "string" in e for e in errors) + + def test_missing_values_for_dense(self): + schema = make_schema({"id": pa.string()}) + errors, warnings = [], [] + _validate_schema(schema, None, 
"dense", errors, warnings) + assert any("'values'" in e for e in errors) + + def test_values_wrong_type(self): + schema = make_schema({"id": pa.string(), "values": pa.list_(pa.string())}) + errors, warnings = [], [] + _validate_schema(schema, None, "dense", errors, warnings) + assert any("'values'" in e and "float" in e for e in errors) + + def test_values_float64_accepted(self): + schema = make_schema({"id": pa.string(), "values": pa.list_(pa.float64())}) + errors, warnings = [], [] + _validate_schema(schema, None, "dense", errors, warnings) + assert errors == [] + + def test_fixed_size_list_dimension_match(self): + schema = make_fixed_dense_schema(dimension=4) + errors, warnings = [], [] + _validate_schema(schema, 4, "dense", errors, warnings) + assert errors == [] + + def test_fixed_size_list_dimension_mismatch(self): + schema = make_fixed_dense_schema(dimension=4) + errors, warnings = [], [] + _validate_schema(schema, 8, "dense", errors, warnings) + assert any("dimension" in e for e in errors) + + def test_fixed_size_list_dimension_inferred_in_warning(self): + schema = make_fixed_dense_schema(dimension=4) + errors, warnings = [], [] + _validate_schema(schema, None, "dense", errors, warnings) + assert errors == [] + assert any("4" in w for w in warnings) + + def test_sparse_schema_valid(self): + errors, warnings = [], [] + _validate_schema(make_sparse_schema(), None, "sparse", errors, warnings) + assert errors == [] + + def test_sparse_missing_sparse_values(self): + schema = make_schema({"id": pa.string()}) + errors, warnings = [], [] + _validate_schema(schema, None, "sparse", errors, warnings) + assert any("sparse_values" in e for e in errors) + + def test_sparse_wrong_type_flat_list(self): + # Old (incorrect) two-column format should be caught + schema = make_schema( + { + "id": pa.string(), + "sparse_values": pa.list_(pa.float32()), + } + ) + errors, warnings = [], [] + _validate_schema(schema, None, "sparse", errors, warnings) + assert any("STRUCT" in e for e in errors) + + def test_sparse_struct_missing_indices_field(self): + bad_struct = pa.struct([pa.field("values", pa.list_(pa.float32()))]) + schema = make_schema({"id": pa.string(), "sparse_values": bad_struct}) + errors, warnings = [], [] + _validate_schema(schema, None, "sparse", errors, warnings) + assert any("STRUCT" in e for e in errors) + + def test_sparse_struct_wrong_indices_type(self): + bad_struct = pa.struct([ + pa.field("indices", pa.list_(pa.float32())), # should be integer + pa.field("values", pa.list_(pa.float32())), + ]) + schema = make_schema({"id": pa.string(), "sparse_values": bad_struct}) + errors, warnings = [], [] + _validate_schema(schema, None, "sparse", errors, warnings) + assert any("STRUCT" in e for e in errors) + + def test_no_vector_columns_detected(self): + schema = make_schema({"id": pa.string(), "category": pa.string()}) + errors, warnings = [], [] + _validate_schema(schema, None, None, errors, warnings) + assert any("No vector columns" in e for e in errors) + + def test_extra_columns_are_error(self): + # Docs: "No additional columns permitted" + schema = make_schema( + {"id": pa.string(), "values": pa.list_(pa.float32()), "source": pa.string()} + ) + errors, warnings = [], [] + _validate_schema(schema, None, "dense", errors, warnings) + assert any("source" in e for e in errors) + + def test_metadata_string_column_ok(self): + schema = make_schema( + { + "id": pa.string(), + "values": pa.list_(pa.float32()), + "metadata": pa.string(), + } + ) + errors, warnings = [], [] + _validate_schema(schema, 
None, "dense", errors, warnings) + assert errors == [] + + def test_metadata_struct_is_error(self): + # Docs only document JSON-encoded UTF-8 string for metadata + schema = make_schema( + { + "id": pa.string(), + "values": pa.list_(pa.float32()), + "metadata": pa.struct([pa.field("genre", pa.string())]), + } + ) + errors, warnings = [], [] + _validate_schema(schema, None, "dense", errors, warnings) + assert any("metadata" in e for e in errors) + + def test_metadata_wrong_type_is_error(self): + schema = make_schema( + { + "id": pa.string(), + "values": pa.list_(pa.float32()), + "metadata": pa.int64(), + } + ) + errors, warnings = [], [] + _validate_schema(schema, None, "dense", errors, warnings) + assert any("metadata" in e for e in errors) + + +# --------------------------------------------------------------------------- +# Data sample validation tests +# --------------------------------------------------------------------------- + + +class TestValidateDataSample: + def test_valid_dense_rows(self): + table = make_dense_table(n=5, dimension=4) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert errors == [] + + def test_null_id(self): + table = make_dense_table(n=3, null_id=True) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("null" in e.lower() and "ID" in e for e in errors) + + def test_empty_id(self): + table = make_dense_table(n=3, bad_id=True) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("empty" in e.lower() for e in errors) + + def test_null_vector(self): + table = make_dense_table(n=3, null_vector=True) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("null" in e.lower() and "vector" in e.lower() for e in errors) + + def test_dimension_mismatch(self): + table = make_dense_table(n=3, dimension=4) + errors, warnings = [], [] + _validate_data_sample(table, 8, errors, warnings) + assert any("dimension" in e.lower() or "length" in e.lower() for e in errors) + + def test_non_finite_value(self): + table = make_dense_table(n=3, dimension=4, non_finite=True) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("finite" in e.lower() or "Inf" in e for e in errors) + + def test_nan_value(self): + schema = make_schema({"id": pa.string(), "values": pa.list_(pa.float32())}) + table = pa.table( + { + "id": pa.array(["a", "b"], pa.string()), + "values": pa.array( + [[float("nan"), 1.0, 2.0, 3.0], [0.0, 1.0, 2.0, 3.0]], + pa.list_(pa.float32()), + ), + } + ) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("finite" in e.lower() or "NaN" in e for e in errors) + + def test_valid_metadata(self): + meta = json.dumps({"genre": "fiction", "year": 2024, "tags": ["a", "b"]}) + table = pa.table( + { + "id": pa.array(["a"], pa.string()), + "values": pa.array([[1.0, 2.0, 3.0, 4.0]], pa.list_(pa.float32())), + "metadata": pa.array([meta], pa.string()), + } + ) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert errors == [] + + def test_metadata_too_large(self): + big_meta = json.dumps({"key": "x" * (41 * 1024)}) + table = pa.table( + { + "id": pa.array(["a"], pa.string()), + "values": pa.array([[1.0, 2.0, 3.0, 4.0]], pa.list_(pa.float32())), + "metadata": pa.array([big_meta], pa.string()), + } + ) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("40 KB" in e or "limit" in e for 
e in errors) + + def test_metadata_invalid_json(self): + table = pa.table( + { + "id": pa.array(["a"], pa.string()), + "values": pa.array([[1.0, 2.0, 3.0, 4.0]], pa.list_(pa.float32())), + "metadata": pa.array(["not json"], pa.string()), + } + ) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("JSON" in e for e in errors) + + def test_metadata_not_dict(self): + table = pa.table( + { + "id": pa.array(["a"], pa.string()), + "values": pa.array([[1.0, 2.0, 3.0, 4.0]], pa.list_(pa.float32())), + "metadata": pa.array([json.dumps([1, 2, 3])], pa.string()), + } + ) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("object" in e.lower() or "dict" in e.lower() for e in errors) + + def test_metadata_no_compatible_fields_warning(self): + table = pa.table( + { + "id": pa.array(["a"], pa.string()), + "values": pa.array([[1.0, 2.0, 3.0, 4.0]], pa.list_(pa.float32())), + "metadata": pa.array([json.dumps({"key": [1, 2, 3]})], pa.string()), + } + ) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert errors == [] + assert any("compatible" in w.lower() for w in warnings) + + +# --------------------------------------------------------------------------- +# End-to-end: validate_bulk_import_uri with real parquet files on disk +# --------------------------------------------------------------------------- + + +class TestValidateBulkImportUri: + def test_valid_single_file(self, tmp_path): + table = make_dense_table(n=10, dimension=4) + path = str(tmp_path / "vectors.parquet") + pq.write_table(table, path) + + result = validate_bulk_import_uri(path, dimension=4) + assert result.is_valid + assert result.files_checked == 1 + assert result.rows_sampled == 10 + + def test_dimension_mismatch_end_to_end(self, tmp_path): + table = make_dense_table(n=5, dimension=4) + path = str(tmp_path / "vectors.parquet") + pq.write_table(table, path) + + result = validate_bulk_import_uri(path, dimension=8) + assert not result.is_valid + assert any("dimension" in e.lower() or "length" in e.lower() for e in result.errors) + + def test_schema_only_no_rows_sampled(self, tmp_path): + table = make_dense_table(n=10, dimension=4) + path = str(tmp_path / "vectors.parquet") + pq.write_table(table, path) + + result = validate_bulk_import_uri(path, dimension=4, sample_rows=0) + assert result.is_valid + assert result.rows_sampled == 0 + + def test_directory_with_multiple_files(self, tmp_path): + for i in range(3): + table = make_dense_table(n=5, dimension=4) + pq.write_table(table, str(tmp_path / f"part-{i}.parquet")) + + result = validate_bulk_import_uri(str(tmp_path), dimension=4) + assert result.is_valid + assert result.files_checked == 3 + + def test_empty_directory(self, tmp_path): + result = validate_bulk_import_uri(str(tmp_path)) + assert not result.is_valid + assert any("No parquet files" in e for e in result.errors) + + def test_missing_id_column_end_to_end(self, tmp_path): + table = pa.table({"values": pa.array([[1.0, 2.0], [3.0, 4.0]], pa.list_(pa.float32()))}) + path = str(tmp_path / "bad.parquet") + pq.write_table(table, path) + + result = validate_bulk_import_uri(path) + assert not result.is_valid + assert any("'id'" in e for e in result.errors) + + def test_result_repr_invalid(self, tmp_path): + table = pa.table({"values": pa.array([[1.0, 2.0]], pa.list_(pa.float32()))}) + path = str(tmp_path / "bad.parquet") + pq.write_table(table, path) + + result = validate_bulk_import_uri(path) + r = repr(result) + assert 
"INVALID" in r + assert "'id'" in r + + def test_result_repr_valid(self, tmp_path): + table = make_dense_table(n=2, dimension=4) + path = str(tmp_path / "ok.parquet") + pq.write_table(table, path) + + result = validate_bulk_import_uri(path, dimension=4) + assert "VALID" in repr(result) + + def test_pyarrow_not_installed(self, monkeypatch): + import builtins + + real_import = builtins.__import__ + + def mock_import(name, *args, **kwargs): + if name.startswith("pyarrow"): + raise ImportError("No module named 'pyarrow'") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", mock_import) + + with pytest.raises(ImportError, match="pinecone\\[parquet\\]"): + validate_bulk_import_uri("/some/path.parquet") diff --git a/uv.lock b/uv.lock index 8fc67c1c5..9cb3ce7cc 100644 --- a/uv.lock +++ b/uv.lock @@ -1580,7 +1580,7 @@ wheels = [ [[package]] name = "pinecone" -version = "8.1.0" +version = "8.1.2" source = { editable = "." } dependencies = [ { name = "certifi" }, @@ -1630,6 +1630,9 @@ grpc = [ { name = "protobuf" }, { name = "protoc-gen-openapiv2" }, ] +parquet = [ + { name = "pyarrow" }, +] types = [ { name = "grpc-stubs" }, { name = "mypy" }, @@ -1664,6 +1667,7 @@ requires-dist = [ { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=3.0.0,<4.0.0" }, { name = "protobuf", marker = "extra == 'grpc'", specifier = ">=6.33.0,<7.0.0" }, { name = "protoc-gen-openapiv2", marker = "extra == 'grpc'", specifier = ">=0.0.1,<0.1.0" }, + { name = "pyarrow", marker = "extra == 'parquet'", specifier = ">=14.0.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = "==8.2.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.25.2,<0.26.0" }, { name = "pytest-benchmark", marker = "python_full_version < '4' and extra == 'dev'", specifier = "==5.0.0" }, @@ -1688,7 +1692,7 @@ requires-dist = [ { name = "urllib3-mock", marker = "extra == 'dev'", specifier = "==0.3.3" }, { name = "vprof", marker = "extra == 'dev'", specifier = ">=0.38,<0.39" }, ] -provides-extras = ["grpc", "asyncio", "types", "dev"] +provides-extras = ["grpc", "asyncio", "parquet", "types", "dev"] [[package]] name = "pinecone-plugin-assistant" @@ -1923,6 +1927,63 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5", size = 22335, upload-time = "2022-10-25T20:38:27.636Z" }, ] +[[package]] +name = "pyarrow" +version = "24.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a5/bf/a34fee1d624152124fa8355c42f34195ad5fe5233ce5bb87946432047d52/pyarrow-24.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:7c2b98645d576a0b9616892ead22b64a83a5f043c5e2ca15ebcefcb5b70c80cb", size = 35076681, upload-time = "2026-04-21T08:51:46.845Z" }, + { url = "https://files.pythonhosted.org/packages/1d/41/64180033d7027afce12dc96d0fe1f504c6fa112190582b458acea2399530/pyarrow-24.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:644a246325b8c69c595ad1dd4b463eba4b0cdb731370e4a86137d433208d6147", size = 36684260, upload-time = "2026-04-21T08:51:53.642Z" 
}, + { url = "https://files.pythonhosted.org/packages/57/02/9b9320e673dd8a99411fac78690f3df92f6dd6f59754c750110bca66d64e/pyarrow-24.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:3a577bd840ca83f646f0a625dbc571dba7044c43c2d1503afc378b570954345c", size = 45698566, upload-time = "2026-04-21T10:46:02.133Z" }, + { url = "https://files.pythonhosted.org/packages/67/33/f75e91b9a64c3f33c787e263c93b871ad91b8a4a68c1d5cebddd9840e835/pyarrow-24.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:e3268e43984d0b1a185c89b4cfff282a7ead12fc93f56cfd7088bdbcbe727041", size = 48835562, upload-time = "2026-04-21T10:46:10.278Z" }, + { url = "https://files.pythonhosted.org/packages/a5/63/097510448e47e4091faa41c43ba92f97cecaab8f4535b56a3d149578f634/pyarrow-24.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2392d954fcb920f42d230284b677605e4e2fbb11f2821e823e642abd67fbb491", size = 49394997, upload-time = "2026-04-21T10:46:18.08Z" }, + { url = "https://files.pythonhosted.org/packages/60/6b/c047d6222ab279024a062742d1807e2fbaf27bba88a98637299ff47b9236/pyarrow-24.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bec9373df11544592b0ba7ec2af0e35059e5f0e7647c6183a854dedd193298f1", size = 51911424, upload-time = "2026-04-21T10:46:25.347Z" }, + { url = "https://files.pythonhosted.org/packages/3a/ba/464cc70761c2a525d97ebd84e21c31ebd47f3ef4bdcee117009f51c46f24/pyarrow-24.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:c42ab9439498270139cc63e18847a02afe5c8b3ed9c931266533cfe378bd3591", size = 27251730, upload-time = "2026-04-21T10:46:30.913Z" }, + { url = "https://files.pythonhosted.org/packages/62/c9/a47ab7ece0d86cbe6678418a0fbd1ac4bb493b9184a3891dfa0e7f287ae0/pyarrow-24.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b0e131f880cda8d04e076cee175a46fc0e8bc8b65c99c6c09dff6669335fde74", size = 35068898, upload-time = "2026-04-21T10:46:36.599Z" }, + { url = "https://files.pythonhosted.org/packages/d1/bc/8db86617a9a58008acf8913d6fed68ea2a46acb6de928db28d724c891a68/pyarrow-24.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:1b2fe7f9a5566401a0ef2571f197eb92358925c1f0c8dba305d6e43ea0871bb3", size = 36679915, upload-time = "2026-04-21T10:46:42.602Z" }, + { url = "https://files.pythonhosted.org/packages/eb/8e/fb178720400ef69db251eb4a9c3ccf4af269bc1feb5055529b8fc87170d1/pyarrow-24.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:0b3537c00fb8d384f15ac1e79b6eb6db04a16514c8c1d22e59a9b95c8ba42868", size = 45697931, upload-time = "2026-04-21T10:46:48.403Z" }, + { url = "https://files.pythonhosted.org/packages/f3/27/99c42abe8e21b44f4917f62631f3aa31404882a2c41d8a4cd5c110e13d52/pyarrow-24.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:14e31a3c9e35f1ab6356c6378f6f72830e6d2d5f1791df3774a7b097d18a6a1e", size = 48837449, upload-time = "2026-04-21T10:46:55.329Z" }, + { url = "https://files.pythonhosted.org/packages/36/b6/333749e2666e9032891125bf9c691146e92901bece62030ac1430e2e7c88/pyarrow-24.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b7d9a514e73bc42711e6a35aaccf3587c520024fe0a25d830a1a8a27c15f4f57", size = 49395949, upload-time = "2026-04-21T10:47:01.869Z" }, + { url = "https://files.pythonhosted.org/packages/17/25/c5201706a2dd374e8ba6ee3fd7a8c89fb7ffc16eed5217a91fd2bd7f7626/pyarrow-24.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b196eb3f931862af3fa84c2a253514d859c08e0d8fe020e07be12e75a5a9780c", size = 51912986, upload-time = "2026-04-21T10:47:09.872Z" }, + { url = 
"https://files.pythonhosted.org/packages/f8/d2/4d1bbba65320b21a49678d6fbdc6ff7c649251359fdcfc03568c4136231d/pyarrow-24.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:35405aecb474e683fb36af650618fd5340ee5471fc65a21b36076a18bbc6c981", size = 27255371, upload-time = "2026-04-21T10:47:15.943Z" }, + { url = "https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559, upload-time = "2026-04-21T10:47:22.17Z" }, + { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654, upload-time = "2026-04-21T10:47:28.315Z" }, + { url = "https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394, upload-time = "2026-04-21T10:47:34.821Z" }, + { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122, upload-time = "2026-04-21T10:47:42.056Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032, upload-time = "2026-04-21T10:47:48.967Z" }, + { url = "https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490, upload-time = "2026-04-21T10:47:55.981Z" }, + { url = "https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660, upload-time = "2026-04-21T10:48:01.718Z" }, + { url = "https://files.pythonhosted.org/packages/6f/d3/a1abf004482026ddc17f4503db227787fa3cfe41ec5091ff20e4fea55e57/pyarrow-24.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:02b001b3ed4723caa44f6cd1af2d5c86aa2cf9971dacc2ffa55b21237713dfba", size = 34976759, upload-time = "2026-04-21T10:48:07.258Z" }, + { url = "https://files.pythonhosted.org/packages/4f/4a/34f0a36d28a2dd32225301b79daad44e243dc1a2bb77d43b60749be255c4/pyarrow-24.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:04920d6a71aabd08a0417709efce97d45ea8e6fb733d9ca9ecffb13c67839f68", size = 36658471, upload-time = "2026-04-21T10:48:13.347Z" }, + { url = "https://files.pythonhosted.org/packages/1f/78/543b94712ae8bb1a6023bcc1acf1a740fbff8286747c289cd9468fced2a5/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a964266397740257f16f7bb2e4f08a0c81454004beab8ff59dd531b73610e9f2", size = 45675981, upload-time = "2026-04-21T10:48:20.201Z" }, + { url = 
"https://files.pythonhosted.org/packages/84/9f/8fb7c222b100d314137fa40ec050de56cd8c6d957d1cfff685ce72f15b17/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6f066b179d68c413374294bc1735f68475457c933258df594443bb9d88ddc2a0", size = 48859172, upload-time = "2026-04-21T10:48:27.541Z" }, + { url = "https://files.pythonhosted.org/packages/a7/d3/1ea72538e6c8b3b475ed78d1049a2c518e655761ea50fe1171fc855fcab7/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1183baeb14c5f587b1ec52831e665718ce632caab84b7cd6b85fd44f96114495", size = 49385733, upload-time = "2026-04-21T10:48:34.7Z" }, + { url = "https://files.pythonhosted.org/packages/c3/be/c3d8b06a1ba35f2260f8e1f771abbee7d5e345c0937aab90675706b1690a/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:806f24b4085453c197a5078218d1ee08783ebbba271badd153d1ae22a3ee804f", size = 51934335, upload-time = "2026-04-21T10:48:42.099Z" }, + { url = "https://files.pythonhosted.org/packages/9c/62/89e07a1e7329d2cde3e3c6994ba0839a24977a2beda8be6005ea3d860b99/pyarrow-24.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e4505fc6583f7b05ab854934896bcac8253b04ac1171a77dfb73efef92076d91", size = 27271748, upload-time = "2026-04-21T10:49:42.532Z" }, + { url = "https://files.pythonhosted.org/packages/17/1a/cff3a59f80b5b1658549d46611b67163f65e0664431c076ad728bf9d5af4/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:1a4e45017efbf115032e4475ee876d525e0e36c742214fbe405332480ecd6275", size = 35238554, upload-time = "2026-04-21T10:48:48.526Z" }, + { url = "https://files.pythonhosted.org/packages/a8/99/cce0f42a327bfef2c420fb6078a3eb834826e5d6697bf3009fe11d2ad051/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:7986f1fa71cee060ad00758bcc79d3a93bab8559bf978fab9e53472a2e25a17b", size = 36782301, upload-time = "2026-04-21T10:48:55.181Z" }, + { url = "https://files.pythonhosted.org/packages/2a/66/8e560d5ff6793ca29aca213c53eec0dd482dd46cb93b2819e5aab52e4252/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:d3e0b61e8efb24ed38898e5cdc5fffa9124be480008d401a1f8071500494ae42", size = 45721929, upload-time = "2026-04-21T10:49:03.676Z" }, + { url = "https://files.pythonhosted.org/packages/27/0c/a26e25505d030716e078d9f16eb74973cbf0b33b672884e9f9da1c83b871/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:55a3bc1e3df3b5567b7d27ef551b2283f0c68a5e86f1cd56abc569da4f31335b", size = 48825365, upload-time = "2026-04-21T10:49:11.714Z" }, + { url = "https://files.pythonhosted.org/packages/5f/eb/771f9ecb0c65e73fe9dccdd1717901b9594f08c4515d000c7c62df573811/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:641f795b361874ac9da5294f8f443dfdbee355cf2bd9e3b8d97aaac2306b9b37", size = 49451819, upload-time = "2026-04-21T10:49:21.474Z" }, + { url = "https://files.pythonhosted.org/packages/48/da/61ae89a88732f5a785646f3ec6125dbb640fa98a540eb2b9889caa561403/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8adc8e6ce5fccf5dc707046ae4914fd537def529709cc0d285d37a7f9cd442ca", size = 51909252, upload-time = "2026-04-21T10:49:31.164Z" }, + { url = "https://files.pythonhosted.org/packages/cb/1a/8dd5cafab7b66573fa91c03d06d213356ad4edd71813aa75e08ce2b3a844/pyarrow-24.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:9b18371ad2f44044b81a8d23bc2d8a9b6a6226dca775e8e16cfee640473d6c5d", size = 27388127, upload-time = "2026-04-21T10:49:37.334Z" }, + { url = 
"https://files.pythonhosted.org/packages/ad/80/d022a34ff05d2cbedd8ccf841fc1f532ecfa9eb5ed1711b56d0e0ea71fc9/pyarrow-24.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:1cc9057f0319e26333b357e17f3c2c022f1a83739b48a88b25bfd5fa2dc18838", size = 35007997, upload-time = "2026-04-21T10:49:48.796Z" }, + { url = "https://files.pythonhosted.org/packages/1a/ff/f01485fda6f4e5d441afb8dd5e7681e4db18826c1e271852f5d3957d6a80/pyarrow-24.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e6f1278ee4785b6db21229374a1c9e54ec7c549de5d1efc9630b6207de7e170b", size = 36678720, upload-time = "2026-04-21T10:49:55.858Z" }, + { url = "https://files.pythonhosted.org/packages/9e/c2/2d2d5fea814237923f71b36495211f20b43a1576f9a4d6da7e751a64ec6f/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:adbbedc55506cbdabb830890444fb856bfb0060c46c6f8026c6c2f2cf86ae795", size = 45741852, upload-time = "2026-04-21T10:50:04.624Z" }, + { url = "https://files.pythonhosted.org/packages/8e/3a/28ba9c1c1ebdbb5f1b94dfebb46f207e52e6a554b7fe4132540fde29a3a0/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ae8a1145af31d903fa9bb166824d7abe9b4681a000b0159c9fb99c11bc11ad26", size = 48889852, upload-time = "2026-04-21T10:50:12.293Z" }, + { url = "https://files.pythonhosted.org/packages/df/51/4a389acfd31dca009f8fb82d7f510bb4130f2b3a8e18cf00194d0687d8ac/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d7027eba1df3b2069e2e8d80f644fa0918b68c46432af3d088ddd390d063ecde", size = 49445207, upload-time = "2026-04-21T10:50:20.677Z" }, + { url = "https://files.pythonhosted.org/packages/19/4b/0bab2b23d2ae901b1b9a03c0efd4b2d070256f8ce3fc43f6e58c167b2081/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e56a1ffe9bf7b727432b89104cc0849c21582949dd7bdcb34f17b2001a351a76", size = 51954117, upload-time = "2026-04-21T10:50:29.14Z" }, + { url = "https://files.pythonhosted.org/packages/29/88/f4e9145da0417b3d2c12035a8492b35ff4a3dbc653e614fcfb51d9dedb38/pyarrow-24.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:38be1808cdd068605b787e6ca9119b27eb275a0234e50212c3492331680c3b1e", size = 28001155, upload-time = "2026-04-21T10:51:22.337Z" }, + { url = "https://files.pythonhosted.org/packages/79/4f/46a49a63f43526da895b1a45bbb51d5baf8e4d77159f8528fc3e5490007f/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:418e48ce50a45a6a6c73c454677203a9c75c966cb1e92ca3370959185f197a05", size = 35250387, upload-time = "2026-04-21T10:50:35.552Z" }, + { url = "https://files.pythonhosted.org/packages/a0/da/d5e0cd5ef00796922404806d5f00325cdadc3441ce2c13fe7115f2df9a64/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:2f16197705a230a78270cdd4ea8a1d57e86b2fdcbc34a1f6aebc72e65c986f9a", size = 36797102, upload-time = "2026-04-21T10:50:42.417Z" }, + { url = "https://files.pythonhosted.org/packages/34/c7/5904145b0a593a05236c882933d439b5720f0a145381179063722fbfc123/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:fb24ac194bfc5e86839d7dcd52092ee31e5fe6733fe11f5e3b06ef0812b20072", size = 45745118, upload-time = "2026-04-21T10:50:49.324Z" }, + { url = "https://files.pythonhosted.org/packages/13/d3/cca42fe166d1c6e4d5b80e530b7949104d10e17508a90ae202dac205ce2a/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:9700ebd9a51f5895ce75ff4ac4b3c47a7d4b42bc618be8e713e5d56bacf5f931", size = 48844765, upload-time = "2026-04-21T10:50:55.579Z" }, + { url = 
"https://files.pythonhosted.org/packages/b0/49/942c3b79878ba928324d1e17c274ed84581db8c0a749b24bcf4cbdf15bd3/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d8ddd2768da81d3ee08cfea9b597f4abb4e8e1dc8ae7e204b608d23a0d3ab699", size = 49471890, upload-time = "2026-04-21T10:51:02.439Z" }, + { url = "https://files.pythonhosted.org/packages/76/97/ff71431000a75d84135a1ace5ca4ba11726a231a8007bbb320a4c54075d5/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:61a3d7eaa97a14768b542f3d284dc6400dd2470d9f080708b13cd46b6ae18136", size = 51932250, upload-time = "2026-04-21T10:51:10.576Z" }, + { url = "https://files.pythonhosted.org/packages/51/be/6f79d55816d5c22557cf27533543d5d70dfe692adfbee4b99f2760674f38/pyarrow-24.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:c91d00057f23b8d353039520dc3a6c09d8608164c692e9f59a175a42b2ae0c19", size = 28131282, upload-time = "2026-04-21T10:51:16.815Z" }, +] + [[package]] name = "pygments" version = "2.20.0" From d125b19b374cf0ba303f04f3f0e245b978e4f5c4 Mon Sep 17 00:00:00 2001 From: Tim Condello Date: Wed, 22 Apr 2026 12:17:47 -0400 Subject: [PATCH 2/3] fix(bulk-import): remove sparse_indices from unexpected-column error message sparse_indices is a sub-field of the sparse_values struct, not a valid top-level column. Listing it as allowed in the error message contradicted the validation logic and would confuse users trying to fix their files. Co-Authored-By: Claude Sonnet 4.6 --- pinecone/db_data/resources/sync/bulk_import_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pinecone/db_data/resources/sync/bulk_import_validator.py b/pinecone/db_data/resources/sync/bulk_import_validator.py index 07f054738..27cc6cfbf 100644 --- a/pinecone/db_data/resources/sync/bulk_import_validator.py +++ b/pinecone/db_data/resources/sync/bulk_import_validator.py @@ -162,7 +162,7 @@ def _validate_schema( if extra: errors.append( f"Unexpected column(s) {sorted(extra)} — no additional columns are permitted. " - "Only 'id', 'values', 'sparse_values', 'sparse_indices', and 'metadata' are allowed." + "Only 'id', 'values', 'sparse_values', and 'metadata' are allowed." 
         )

From 322b522468e4157977b521ad97761d58a44d4aab Mon Sep 17 00:00:00 2001
From: Tim Condello
Date: Wed, 22 Apr 2026 13:45:06 -0400
Subject: [PATCH 3/3] fix(bulk-import): address PR review feedback and fix CI failures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add `continue` after metadata size error to prevent double-reporting the same row
- Apply consistent error prefix (respects multi-file flag) to schema-read failures
- Remove quoted return-type annotations on validate() — class is imported at module level
- Add BulkImportValidationResult and validate_bulk_import to __init__.pyi __all__
- Use explicit re-export pattern (import X as X) in __init__.pyi to satisfy ruff F401
- Remove unused TYPE_CHECKING import of pyarrow.parquet in bulk_import_validator.py
- Remove unused imports and variables in test_bulk_import_validator.py (ruff F841/F401)
- Add mypy overrides for pyarrow optional dependency to silence import-not-found errors

Co-Authored-By: Claude Sonnet 4.6
---
 CLAUDE.md                                      | 69 -------------------
 pinecone/__init__.pyi                          |  4 +-
 .../resources/asyncio/bulk_import_asyncio.py   |  2 +-
 .../db_data/resources/sync/bulk_import.py      |  2 +-
 .../resources/sync/bulk_import_validator.py    |  5 +-
 pyproject.toml                                 |  7 ++
 tests/unit/data/test_bulk_import_validator.py  |  6 --
 7 files changed, 15 insertions(+), 80 deletions(-)
 delete mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
deleted file mode 100644
index f376d906a..000000000
--- a/CLAUDE.md
+++ /dev/null
@@ -1,69 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Development Setup
-
-```sh
-uv sync --extra grpc --extra asyncio # install all dependencies
-uv run pre-commit install # enable lint/format checks on commit
-```
-
-## Key Commands
-
-```sh
-make test-unit # run unit + grpc unit tests
-uv run pytest tests/unit # REST unit tests only
-uv run pytest tests/unit_grpc # gRPC unit tests only
-uv run pytest tests/unit/path/to/test_file.py::ClassName::test_method # single test
-
-uv run mypy pinecone # type-check (excludes pinecone/core/)
-uv run ruff check --fix # lint
-uv run ruff format # format
-
-uv run repl # interactive REPL with pre-loaded Pinecone client
-
-make generate-oas # regenerate pinecone/core/openapi/ from OpenAPI specs
-```
-
-Integration tests make live Pinecone API calls and incur cost — only Pinecone employees should run them. Set credentials in `.env` (see `.env.example`) before running.
-
-## Architecture
-
-### Layer Overview
-
-```
-Pinecone / PineconeAsyncio ← public entry point (pinecone/pinecone.py, pinecone_asyncio.py)
- ├── DBControl ← index/collection/backup management (pinecone/db_control/)
- ├── DBData / Index ← vector upsert/query/fetch/delete (pinecone/db_data/)
- └── Inference ← embedding and reranking models (pinecone/inference/)
-```
-
-`Pinecone` and `PineconeAsyncio` are thin facades. Each delegates to `DBControl` (control-plane operations) and returns `Index` / `IndexAsyncio` objects (data-plane operations). Inference is accessible via `pc.inference`.
-
-### Generated Code — Never Edit Manually
-
-`pinecone/core/openapi/` is fully generated from OpenAPI specs via `make generate-oas` (which runs `codegen/build-oas.sh`). The script calls the openapi-generator Docker image, applies several post-processing fixes (underscore field name normalization, datetime coercion removal, shared-class deduplication), then runs `ruff format`. **Do not hand-edit files in `pinecone/core/`.**
-
-Shared OpenAPI utilities (ApiClient, exceptions, model_utils, etc.) live in `pinecone/openapi_support/` rather than being duplicated across the five generated modules (`db_control`, `db_data`, `inference`, `oauth`, `admin`).
-
-### Adapter Layer
-
-`pinecone/adapters/` converts generated OpenAPI response objects into clean SDK dataclasses. This isolates the rest of the SDK from generated-model churn. When a new response type is needed, add it here rather than parsing OpenAPI objects in index.py or other business logic files.
-
-### Sync / Async Split
-
-Every stateful class has a sync and an async variant:
-- `DBControl` / `DBControlAsyncio`
-- `Index` (in `db_data/index.py`) / `IndexAsyncio` (in `db_data/index_asyncio.py`)
-- `Inference` / `AsyncioInference`
-
-The async variants use `aiohttp` (optional extra). The sync variants use `urllib3`. gRPC is a third transport option installed via the `grpc` extra; data-plane integration tests can be toggled to gRPC with `USE_GRPC=true`.
-
-### Lazy Imports
-
-`pinecone/__init__.py` defers most imports through `utils/lazy_imports.py` to keep module startup time fast. When adding new public symbols, register them in the lazy import maps in `__init__.py` rather than adding top-level imports. The `.pyi` stub (`__init__.pyi`) is the authoritative type-visible public API surface and must be kept in sync.
-
-### Testing Philosophy
-
-Unit tests are intentionally sparse — they cover data conversion edge cases (e.g. `VectorFactory`, `QueryResultsAggregator`) but not every method. Most confidence comes from integration tests. When writing unit tests, check `tests/unit/db_data/` for patterns. Fixtures and index setup/teardown for integration tests live in `conftest.py` files at each directory level.
diff --git a/pinecone/__init__.pyi b/pinecone/__init__.pyi
index 10e3e2974..97fa534d2 100644
--- a/pinecone/__init__.pyi
+++ b/pinecone/__init__.pyi
@@ -50,7 +50,7 @@ from pinecone.db_data.models import (
 )
 from pinecone.core.openapi.db_data.models import NamespaceDescription
 from pinecone.db_data.resources.sync.bulk_import import ImportErrorMode
-from pinecone.db_data.dataclasses.bulk_import_validation_result import BulkImportValidationResult
+from pinecone.db_data.dataclasses.bulk_import_validation_result import BulkImportValidationResult as BulkImportValidationResult
 from pinecone.db_data.resources.sync.bulk_import_validator import validate_bulk_import_uri as validate_bulk_import
 from pinecone.db_data.errors import (
@@ -184,6 +184,8 @@ __all__ = [
     "UpdateRequest",
     "NamespaceDescription",
     "ImportErrorMode",
+    "BulkImportValidationResult",
+    "validate_bulk_import",
     "FilterBuilder",
     # Error classes
     "VectorDictionaryMissingKeysError",
diff --git a/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py b/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py
index 31e5f9f96..817e0ff3b 100644
--- a/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py
+++ b/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py
@@ -162,7 +162,7 @@ def validate(
         vector_type: Literal["dense", "sparse"] | None = None,
         sample_rows: int = 100,
         verbose: bool = False,
-    ) -> "BulkImportValidationResult":
+    ) -> BulkImportValidationResult:
         """Validate parquet file(s) for Pinecone bulk import compatibility.

         This method is synchronous; pyarrow does not support async file I/O.
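The two `validate()` hunks above and below only touch the return annotation. For orientation, the call pattern this series adds looks roughly like the sketch that follows. The index name and URI are placeholders, and the argument and attribute names (`sample_rows`, `verbose`, `.is_valid`, `.errors`, `.uri`) are taken from the commit messages and diffs in this series, so treat the exact signatures as assumptions rather than a published reference:

```python
from pinecone import Pinecone, validate_bulk_import

pc = Pinecone(api_key="YOUR_API_KEY")   # placeholder credentials
index = pc.Index("example-index")       # hypothetical index name

# Footer-only schema check by default; sample_rows also inspects row data.
result = index.bulk_import.validate(
    "s3://example-bucket/vectors/",     # hypothetical URI
    sample_rows=100,
    verbose=True,
)

if result.is_valid:
    # The series description says result.uri can feed the import call;
    # the keyword name used here is an assumption.
    index.bulk_import.start(uri=result.uri)
else:
    for err in result.errors:
        print(err)

# Top-level helper for validating a URI without constructing an Index first.
standalone = validate_bulk_import("s3://example-bucket/vectors/")
```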
diff --git a/pinecone/db_data/resources/sync/bulk_import.py b/pinecone/db_data/resources/sync/bulk_import.py
index 8047fe9fe..483a3c542 100644
--- a/pinecone/db_data/resources/sync/bulk_import.py
+++ b/pinecone/db_data/resources/sync/bulk_import.py
@@ -169,7 +169,7 @@ def validate(
         vector_type: Literal["dense", "sparse"] | None = None,
         sample_rows: int = 100,
         verbose: bool = False,
-    ) -> "BulkImportValidationResult":
+    ) -> BulkImportValidationResult:
         """Validate parquet file(s) for Pinecone bulk import compatibility.

         Reads only the parquet file footer (schema metadata) by default, making
diff --git a/pinecone/db_data/resources/sync/bulk_import_validator.py b/pinecone/db_data/resources/sync/bulk_import_validator.py
index 27cc6cfbf..d16c1a0ec 100644
--- a/pinecone/db_data/resources/sync/bulk_import_validator.py
+++ b/pinecone/db_data/resources/sync/bulk_import_validator.py
@@ -10,7 +10,6 @@
 if TYPE_CHECKING:
     import pyarrow as pa
-    import pyarrow.parquet as pq


 # Matches Pinecone's documented metadata size limit.
 _MAX_METADATA_BYTES = 40 * 1024
@@ -231,6 +230,7 @@ def _validate_data_sample(
             errors.append(
                 f"Row {i}: metadata size {size} bytes exceeds the 40 KB limit"
             )
+            continue
         try:
             obj = json.loads(raw)
         except json.JSONDecodeError as e:
@@ -353,7 +353,8 @@ def validate_bulk_import_uri(
             schema = pq.read_schema(file_uri)
         except Exception as e:
             msg = f"failed to read parquet schema: {e}"
-            errors.append(f"{file_uri}: {msg}")
+            prefix = f"{file_uri}: " if multi else ""
+            errors.append(f"{prefix}{msg}")
             if verbose:
                 print(f"[{index:>{len(str(total))}}/{total}] BAD {file_uri}")
                 print(f" {msg}")
diff --git a/pyproject.toml b/pyproject.toml
index 06d3efae4..1cb278c03 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -149,6 +149,13 @@ line-ending = "auto"
 docstring-code-format = false
 docstring-code-line-length = "dynamic"

+[tool.mypy]
+ignore_missing_imports = false
+
+[[tool.mypy.overrides]]
+module = ["pyarrow", "pyarrow.*"]
+ignore_missing_imports = true
+
 [tool.ruff.lint.per-file-ignores]
 # F403 Allow star imports
 "__init__.py" = ["F403", "F405"]
diff --git a/tests/unit/data/test_bulk_import_validator.py b/tests/unit/data/test_bulk_import_validator.py
index 0634e15dd..cfd4250b3 100644
--- a/tests/unit/data/test_bulk_import_validator.py
+++ b/tests/unit/data/test_bulk_import_validator.py
@@ -5,7 +5,6 @@
 """

 import json
-import math
 import pytest

 pytest.importorskip("pyarrow", reason="pyarrow required for bulk import validation")
@@ -18,9 +17,6 @@
     _validate_schema,
     _validate_data_sample,
 )
-from pinecone.db_data.dataclasses.bulk_import_validation_result import (
-    BulkImportValidationResult,
-)


 # ---------------------------------------------------------------------------
@@ -86,7 +82,6 @@ def make_dense_table(
     null_vector: bool = False,
     non_finite: bool = False,
 ) -> pa.Table:
-    schema = make_schema({"id": pa.string(), "values": pa.list_(float_type)})
     ids = [None if (null_id and i == 0) else ("" if (bad_id and i == 0) else f"vec-{i}") for i in range(n)]
     vectors = []
     for i in range(n):
@@ -297,7 +292,6 @@ def test_non_finite_value(self):
         assert any("finite" in e.lower() or "Inf" in e for e in errors)

     def test_nan_value(self):
-        schema = make_schema({"id": pa.string(), "values": pa.list_(pa.float32())})
         table = pa.table(
             {
                 "id": pa.array(["a", "b"], pa.string()),
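The unexpected-column message fixed in patch 2 and the table helpers in these tests imply the accepted top-level layout: `id`, `values`, `sparse_values`, and `metadata`, with sparse indices nested inside the `sparse_values` struct and metadata carried as a JSON string checked against the 40 KB limit. A sketch of a file that should satisfy that schema check follows; the struct sub-field names (`indices`, `values`) and the exact metadata content are assumptions for illustration, not taken from the diffs:

```python
import json

import pyarrow as pa
import pyarrow.parquet as pq

# Only these four top-level columns are accepted; a separate top-level
# sparse_indices column is exactly what the corrected error message rejects.
schema = pa.schema(
    [
        pa.field("id", pa.string()),
        pa.field("values", pa.list_(pa.float32())),
        pa.field(
            "sparse_values",
            pa.struct(
                [("indices", pa.list_(pa.int64())), ("values", pa.list_(pa.float32()))]
            ),
        ),
        pa.field("metadata", pa.string()),  # JSON string, kept well under 40 KB
    ]
)

table = pa.table(
    {
        "id": ["vec-0", "vec-1"],
        "values": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
        "sparse_values": [
            {"indices": [1, 5], "values": [0.9, 0.1]},
            {"indices": [2], "values": [0.7]},
        ],
        "metadata": [json.dumps({"genre": "docs"}), json.dumps({"genre": "web"})],
    },
    schema=schema,
)
pq.write_table(table, "example.parquet")  # illustrative output path
```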