From a602f42f2b7c977246b1c36312d2bed74b0a7288 Mon Sep 17 00:00:00 2001 From: Tim Condello Date: Tue, 21 Apr 2026 20:26:07 -0400 Subject: [PATCH 1/3] feat(bulk-import): add client-side parquet validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `index.bulk_import.validate(uri)` and the top-level `pinecone.validate_bulk_import(uri)` helper so users can check parquet files for schema and data correctness before sending them to the server. - Reads only the parquet footer (schema) by default — no vector data downloaded even for large remote files - Optionally samples up to N rows to detect null IDs, non-finite values, metadata JSON errors, and the 40 KB metadata size limit - Supports single files and directories via the pyarrow filesystem abstraction (s3://, gs://, az:// URIs work automatically) - Returns BulkImportValidationResult with .is_valid, .errors, .warnings, .files_checked, .rows_sampled; the .uri field can be passed directly to index.bulk_import.start() - Verbose mode prints per-file OK/BAD lines and a final summary - pyarrow is an optional dependency: pip install 'pinecone[parquet]' - 40 unit tests covering schema validation, data sampling, and end-to-end file I/O via real parquet files on disk Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 69 +++ pinecone/__init__.py | 8 + pinecone/__init__.pyi | 2 + pinecone/db_data/dataclasses/__init__.py | 2 + .../bulk_import_validation_result.py | 42 ++ .../resources/asyncio/bulk_import_asyncio.py | 24 + .../db_data/resources/sync/bulk_import.py | 55 +++ .../resources/sync/bulk_import_validator.py | 409 +++++++++++++++ pyproject.toml | 3 + tests/unit/data/test_bulk_import_validator.py | 466 ++++++++++++++++++ uv.lock | 65 ++- 11 files changed, 1143 insertions(+), 2 deletions(-) create mode 100644 CLAUDE.md create mode 100644 pinecone/db_data/dataclasses/bulk_import_validation_result.py create mode 100644 pinecone/db_data/resources/sync/bulk_import_validator.py create mode 100644 tests/unit/data/test_bulk_import_validator.py diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..f376d906a --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,69 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Development Setup + +```sh +uv sync --extra grpc --extra asyncio # install all dependencies +uv run pre-commit install # enable lint/format checks on commit +``` + +## Key Commands + +```sh +make test-unit # run unit + grpc unit tests +uv run pytest tests/unit # REST unit tests only +uv run pytest tests/unit_grpc # gRPC unit tests only +uv run pytest tests/unit/path/to/test_file.py::ClassName::test_method # single test + +uv run mypy pinecone # type-check (excludes pinecone/core/) +uv run ruff check --fix # lint +uv run ruff format # format + +uv run repl # interactive REPL with pre-loaded Pinecone client + +make generate-oas # regenerate pinecone/core/openapi/ from OpenAPI specs +``` + +Integration tests make live Pinecone API calls and incur cost — only Pinecone employees should run them. Set credentials in `.env` (see `.env.example`) before running. 
+ +## Architecture + +### Layer Overview + +``` +Pinecone / PineconeAsyncio ← public entry point (pinecone/pinecone.py, pinecone_asyncio.py) + ├── DBControl ← index/collection/backup management (pinecone/db_control/) + ├── DBData / Index ← vector upsert/query/fetch/delete (pinecone/db_data/) + └── Inference ← embedding and reranking models (pinecone/inference/) +``` + +`Pinecone` and `PineconeAsyncio` are thin facades. Each delegates to `DBControl` (control-plane operations) and returns `Index` / `IndexAsyncio` objects (data-plane operations). Inference is accessible via `pc.inference`. + +### Generated Code — Never Edit Manually + +`pinecone/core/openapi/` is fully generated from OpenAPI specs via `make generate-oas` (which runs `codegen/build-oas.sh`). The script calls the openapi-generator Docker image, applies several post-processing fixes (underscore field name normalization, datetime coercion removal, shared-class deduplication), then runs `ruff format`. **Do not hand-edit files in `pinecone/core/`.** + +Shared OpenAPI utilities (ApiClient, exceptions, model_utils, etc.) live in `pinecone/openapi_support/` rather than being duplicated across the five generated modules (`db_control`, `db_data`, `inference`, `oauth`, `admin`). + +### Adapter Layer + +`pinecone/adapters/` converts generated OpenAPI response objects into clean SDK dataclasses. This isolates the rest of the SDK from generated-model churn. When a new response type is needed, add it here rather than parsing OpenAPI objects in index.py or other business logic files. + +### Sync / Async Split + +Every stateful class has a sync and an async variant: +- `DBControl` / `DBControlAsyncio` +- `Index` (in `db_data/index.py`) / `IndexAsyncio` (in `db_data/index_asyncio.py`) +- `Inference` / `AsyncioInference` + +The async variants use `aiohttp` (optional extra). The sync variants use `urllib3`. gRPC is a third transport option installed via the `grpc` extra; data-plane integration tests can be toggled to gRPC with `USE_GRPC=true`. + +### Lazy Imports + +`pinecone/__init__.py` defers most imports through `utils/lazy_imports.py` to keep module startup time fast. When adding new public symbols, register them in the lazy import maps in `__init__.py` rather than adding top-level imports. The `.pyi` stub (`__init__.pyi`) is the authoritative type-visible public API surface and must be kept in sync. + +### Testing Philosophy + +Unit tests are intentionally sparse — they cover data conversion edge cases (e.g. `VectorFactory`, `QueryResultsAggregator`) but not every method. Most confidence comes from integration tests. When writing unit tests, check `tests/unit/db_data/` for patterns. Fixtures and index setup/teardown for integration tests live in `conftest.py` files at each directory level. 
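### Example: Registering a New Public Symbol

A minimal sketch of the lazy-import registration described above (the entry shown here is the one added for bulk import validation; swap in your own symbol and module path):

```python
# In the lazy import map in pinecone/__init__.py, keys are public names and
# values are (module path, attribute name) tuples:
"BulkImportValidationResult": (
    "pinecone.db_data.dataclasses.bulk_import_validation_result",
    "BulkImportValidationResult",
),
```

Mirror the same symbol in `pinecone/__init__.pyi` with a regular import so type checkers and IDEs can see it:

```python
from pinecone.db_data.dataclasses.bulk_import_validation_result import BulkImportValidationResult
```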
diff --git a/pinecone/__init__.py b/pinecone/__init__.py index 1064610c2..39f814146 100644 --- a/pinecone/__init__.py +++ b/pinecone/__init__.py @@ -60,6 +60,14 @@ "UpdateRequest": ("pinecone.db_data.models", "UpdateRequest"), "NamespaceDescription": ("pinecone.core.openapi.db_data.models", "NamespaceDescription"), "ImportErrorMode": ("pinecone.db_data.resources.sync.bulk_import", "ImportErrorMode"), + "BulkImportValidationResult": ( + "pinecone.db_data.dataclasses.bulk_import_validation_result", + "BulkImportValidationResult", + ), + "validate_bulk_import": ( + "pinecone.db_data.resources.sync.bulk_import_validator", + "validate_bulk_import_uri", + ), "FilterBuilder": ("pinecone.db_data.filter_builder", "FilterBuilder"), "VectorDictionaryMissingKeysError": ( "pinecone.db_data.errors", diff --git a/pinecone/__init__.pyi b/pinecone/__init__.pyi index 45ca8caf3..10e3e2974 100644 --- a/pinecone/__init__.pyi +++ b/pinecone/__init__.pyi @@ -50,6 +50,8 @@ from pinecone.db_data.models import ( ) from pinecone.core.openapi.db_data.models import NamespaceDescription from pinecone.db_data.resources.sync.bulk_import import ImportErrorMode +from pinecone.db_data.dataclasses.bulk_import_validation_result import BulkImportValidationResult +from pinecone.db_data.resources.sync.bulk_import_validator import validate_bulk_import_uri as validate_bulk_import from pinecone.db_data.errors import ( VectorDictionaryMissingKeysError, VectorDictionaryExcessKeysError, diff --git a/pinecone/db_data/dataclasses/__init__.py b/pinecone/db_data/dataclasses/__init__.py index d6709e8ab..5eb677e6f 100644 --- a/pinecone/db_data/dataclasses/__init__.py +++ b/pinecone/db_data/dataclasses/__init__.py @@ -8,6 +8,7 @@ from .query_response import QueryResponse from .upsert_response import UpsertResponse from .update_response import UpdateResponse +from .bulk_import_validation_result import BulkImportValidationResult __all__ = [ "SparseValues", @@ -21,4 +22,5 @@ "QueryResponse", "UpsertResponse", "UpdateResponse", + "BulkImportValidationResult", ] diff --git a/pinecone/db_data/dataclasses/bulk_import_validation_result.py b/pinecone/db_data/dataclasses/bulk_import_validation_result.py new file mode 100644 index 000000000..7b4f669c8 --- /dev/null +++ b/pinecone/db_data/dataclasses/bulk_import_validation_result.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass +class BulkImportValidationResult: + """Result of a bulk import parquet validation check. + + Attributes: + is_valid: True if no errors were found. + uri: The URI that was validated. Pass directly to ``index.bulk_import.start()``. + errors: Blocking issues that would cause the import to fail. + warnings: Non-blocking observations (e.g. detected dimension). + files_checked: Number of parquet files whose schema was inspected. + rows_sampled: Number of data rows checked (0 if schema-only validation). 
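
    Example:
        A typical gating pattern before starting an import (the ``index``
        handle and the bucket path below are placeholders)::

            result = index.bulk_import.validate("s3://my-bucket/vectors/", sample_rows=0)
            if result.is_valid:
                index.bulk_import.start(uri=result.uri)
            else:
                for err in result.errors:
                    print(err)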
+ """ + + is_valid: bool + uri: str = "" + errors: list[str] = field(default_factory=list) + warnings: list[str] = field(default_factory=list) + files_checked: int = 0 + rows_sampled: int = 0 + + def __repr__(self) -> str: + status = "VALID" if self.is_valid else "INVALID" + lines = [f"BulkImportValidationResult({status})"] + if self.uri: + lines.append(f" uri={self.uri!r}") + if self.errors: + lines.append(f" errors ({len(self.errors)}):") + for e in self.errors: + lines.append(f" - {e}") + if self.warnings: + lines.append(f" warnings ({len(self.warnings)}):") + for w in self.warnings: + lines.append(f" - {w}") + lines.append( + f" files_checked={self.files_checked}, rows_sampled={self.rows_sampled}" + ) + return "\n".join(lines) diff --git a/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py b/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py index 3610e7fec..31e5f9f96 100644 --- a/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py +++ b/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py @@ -11,6 +11,10 @@ ) from ..sync.bulk_import_request_factory import BulkImportRequestFactory +from ..sync.bulk_import_validator import validate_bulk_import_uri +from pinecone.db_data.dataclasses.bulk_import_validation_result import ( + BulkImportValidationResult, +) for m in [StartImportResponse, ListImportsResponse, ImportModel]: install_json_repr_override(m) @@ -150,3 +154,23 @@ async def cancel(self, id: str): """ args = BulkImportRequestFactory.cancel_import_args(id=id) return await self.__import_operations_api.cancel_bulk_import(**args) + + def validate( + self, + uri: str, + dimension: int | None = None, + vector_type: Literal["dense", "sparse"] | None = None, + sample_rows: int = 100, + verbose: bool = False, + ) -> "BulkImportValidationResult": + """Validate parquet file(s) for Pinecone bulk import compatibility. + + This method is synchronous; pyarrow does not support async file I/O. + For schema-only validation (no data download) pass ``sample_rows=0``. + + See :meth:`pinecone.db_data.resources.sync.bulk_import.BulkImportResource.validate` + for full documentation. + """ + return validate_bulk_import_uri( + uri, dimension=dimension, vector_type=vector_type, sample_rows=sample_rows, verbose=verbose + ) diff --git a/pinecone/db_data/resources/sync/bulk_import.py b/pinecone/db_data/resources/sync/bulk_import.py index 440cc588c..8047fe9fe 100644 --- a/pinecone/db_data/resources/sync/bulk_import.py +++ b/pinecone/db_data/resources/sync/bulk_import.py @@ -11,6 +11,10 @@ ) from .bulk_import_request_factory import BulkImportRequestFactory, ImportErrorMode +from .bulk_import_validator import validate_bulk_import_uri +from pinecone.db_data.dataclasses.bulk_import_validation_result import ( + BulkImportValidationResult, +) for m in [StartImportResponse, ListImportsResponse, ImportModel]: install_json_repr_override(m) @@ -157,3 +161,54 @@ def cancel(self, id: str): """ args = BulkImportRequestFactory.cancel_import_args(id=id) return self.__import_operations_api.cancel_bulk_import(**args) + + def validate( + self, + uri: str, + dimension: int | None = None, + vector_type: Literal["dense", "sparse"] | None = None, + sample_rows: int = 100, + verbose: bool = False, + ) -> "BulkImportValidationResult": + """Validate parquet file(s) for Pinecone bulk import compatibility. + + Reads only the parquet file footer (schema metadata) by default, making + this fast even for large remote files. 
Pass ``sample_rows > 0`` (the + default) to also read a small number of rows and check for null IDs, + non-finite vector values, and metadata correctness. + + Requires ``pyarrow``. Install with ``pip install 'pinecone[parquet]'``. + Remote URIs (``s3://``, ``gs://``, ``az://``) work automatically when + the appropriate filesystem library is available in your environment + (``pyarrow`` includes built-in S3 support). + + Args: + uri: Local path or remote URI. May point to a single ``.parquet`` + file or a directory/prefix containing multiple files. + dimension: Expected vector dimension. A mismatch is reported as an + error. When omitted, dimension is inferred from the schema if + the file uses a ``fixed_size_list`` type. + vector_type: ``"dense"`` or ``"sparse"``. Inferred from column + names when omitted. + sample_rows: Rows to read for data-level checks. Set to ``0`` for + schema-only validation (no data download). + + Returns: + :class:`~pinecone.BulkImportValidationResult` + + Examples: + >>> result = index.bulk_import.validate("s3://my-bucket/vectors/") + >>> if not result.is_valid: + ... for error in result.errors: + ... print(error) + + >>> # Schema-only check — reads only the parquet footer + >>> result = index.bulk_import.validate( + ... "s3://my-bucket/vectors/", + ... dimension=1024, + ... sample_rows=0, + ... ) + """ + return validate_bulk_import_uri( + uri, dimension=dimension, vector_type=vector_type, sample_rows=sample_rows, verbose=verbose + ) diff --git a/pinecone/db_data/resources/sync/bulk_import_validator.py b/pinecone/db_data/resources/sync/bulk_import_validator.py new file mode 100644 index 000000000..07f054738 --- /dev/null +++ b/pinecone/db_data/resources/sync/bulk_import_validator.py @@ -0,0 +1,409 @@ +from __future__ import annotations + +import json +import math +from typing import TYPE_CHECKING + +from pinecone.db_data.dataclasses.bulk_import_validation_result import ( + BulkImportValidationResult, +) + +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.parquet as pq + +# Matches Pinecone's documented metadata size limit. +_MAX_METADATA_BYTES = 40 * 1024 + +# Scalar types Pinecone accepts as metadata values. +_VALID_METADATA_SCALAR_TYPES = (str, int, float, bool) + +# Columns that have special meaning in bulk import parquet files. +# Note: sparse indices/values are sub-fields of the 'sparse_values' struct, not top-level columns. +_KNOWN_COLUMNS = {"id", "values", "sparse_values", "metadata"} + + +def _require_pyarrow() -> None: + try: + import pyarrow # noqa: F401 + except ImportError: + raise ImportError( + "pyarrow is required for bulk import validation. 
" + "Install it with: pip install 'pinecone[parquet]'" + ) + + +# --------------------------------------------------------------------------- +# Type helpers +# --------------------------------------------------------------------------- + + +def _is_string_type(t) -> bool: + import pyarrow as pa + + return bool(pa.types.is_string(t) or pa.types.is_large_string(t)) + + +def _is_float_list_type(t) -> bool: + import pyarrow as pa + + if pa.types.is_list(t) or pa.types.is_large_list(t) or pa.types.is_fixed_size_list(t): + return bool(pa.types.is_floating(t.value_type)) + return False + + +def _is_integer_list_type(t) -> bool: + import pyarrow as pa + + if pa.types.is_list(t) or pa.types.is_large_list(t) or pa.types.is_fixed_size_list(t): + return bool(pa.types.is_integer(t.value_type)) + return False + + +def _fixed_list_size(t) -> int | None: + """Return the list size if the type is fixed_size_list, else None.""" + import pyarrow as pa + + if pa.types.is_fixed_size_list(t): + return int(t.list_size) + return None + + +def _is_sparse_struct_type(t) -> bool: + """Check if t matches STRUCT, values: LIST>. + + Confirmed format from real Pinecone parquet files: + sparse_values: struct, values: list> + """ + import pyarrow as pa + + if not pa.types.is_struct(t): + return False + field_names = {t.field(i).name for i in range(t.num_fields)} + if "indices" not in field_names or "values" not in field_names: + return False + return bool( + _is_integer_list_type(t.field("indices").type) + and _is_float_list_type(t.field("values").type) + ) + + +# --------------------------------------------------------------------------- +# Schema validation (reads only the parquet footer — no data download) +# --------------------------------------------------------------------------- + + +def _validate_schema( + schema: "pa.Schema", + dimension: int | None, + vector_type: str | None, + errors: list[str], + warnings: list[str], +) -> None: + field_names = set(schema.names) + + if "id" not in field_names: + errors.append("Missing required column 'id'") + else: + t = schema.field("id").type + if not _is_string_type(t): + errors.append(f"Column 'id' must be string type, got {t}") + + has_values = "values" in field_names + has_sparse = "sparse_values" in field_names + + is_dense = (vector_type == "dense") if vector_type else has_values + is_sparse = (vector_type == "sparse") if vector_type else has_sparse + + if not is_dense and not is_sparse: + errors.append( + "No vector columns detected. " + "Expected a 'values' column (dense) or a 'sparse_values' struct column (sparse)." 
+ ) + return + + if is_dense: + if "values" not in field_names: + errors.append("Missing required column 'values' for dense vectors") + else: + t = schema.field("values").type + if not _is_float_list_type(t): + errors.append(f"Column 'values' must be a list of floats, got {t}") + else: + schema_dim = _fixed_list_size(t) + if schema_dim is not None and dimension is not None and schema_dim != dimension: + errors.append( + f"Vector dimension in schema ({schema_dim}) does not match " + f"expected dimension ({dimension})" + ) + elif schema_dim is not None and dimension is None: + warnings.append(f"Detected vector dimension from schema: {schema_dim}") + + if is_sparse: + if "sparse_values" not in field_names: + errors.append("Missing required column 'sparse_values' for sparse vectors") + else: + t = schema.field("sparse_values").type + if not _is_sparse_struct_type(t): + errors.append( + f"Column 'sparse_values' must be " + f"STRUCT, values: LIST>, got {t}" + ) + + if "metadata" in field_names: + t = schema.field("metadata").type + if not _is_string_type(t): + errors.append( + f"Column 'metadata' must be a JSON-encoded UTF-8 string, got {t}. " + "See https://docs.pinecone.io/guides/index-data/import-data" + ) + + extra = field_names - _KNOWN_COLUMNS + if extra: + errors.append( + f"Unexpected column(s) {sorted(extra)} — no additional columns are permitted. " + "Only 'id', 'values', 'sparse_values', 'sparse_indices', and 'metadata' are allowed." + ) + + +# --------------------------------------------------------------------------- +# Data validation (reads a small sample of rows) +# --------------------------------------------------------------------------- + + +def _is_valid_metadata_value(v) -> bool: + if isinstance(v, _VALID_METADATA_SCALAR_TYPES): + return True + if isinstance(v, list): + return all(isinstance(x, str) for x in v) + return False + + +def _validate_data_sample( + table: "pa.Table", + dimension: int | None, + errors: list[str], + warnings: list[str], +) -> None: + import pyarrow as pa + + if "id" in table.schema.names: + id_col = table.column("id") + if id_col.null_count > 0: + errors.append(f"Found {id_col.null_count} null ID(s)") + empty = sum(1 for v in id_col if v.is_valid and v.as_py() == "") + if empty: + errors.append(f"Found {empty} empty string ID(s)") + + if "values" in table.schema.names: + values_col = table.column("values") + if values_col.null_count > 0: + errors.append(f"Found {values_col.null_count} null vector(s) in 'values'") + + for i, val in enumerate(values_col): + if not val.is_valid: + continue + arr = val.as_py() + if arr is None: + continue + if dimension is not None and len(arr) != dimension: + errors.append( + f"Row {i}: vector length {len(arr)} != expected dimension {dimension}" + ) + break + non_finite = [x for x in arr if x is None or not math.isfinite(x)] + if non_finite: + errors.append(f"Row {i}: 'values' contains non-finite value(s) (NaN or Inf)") + break + + if "metadata" in table.schema.names: + meta_col = table.column("metadata") + # Only validate JSON-string metadata; struct columns are validated by the schema check. 
+ if pa.types.is_string(meta_col.type) or pa.types.is_large_string(meta_col.type): + for i, val in enumerate(meta_col): + if not val.is_valid: + continue + raw = val.as_py() + if raw is None: + continue + size = len(raw.encode("utf-8")) + if size > _MAX_METADATA_BYTES: + errors.append( + f"Row {i}: metadata size {size} bytes exceeds the 40 KB limit" + ) + try: + obj = json.loads(raw) + except json.JSONDecodeError as e: + errors.append(f"Row {i}: metadata is not valid JSON: {e}") + continue + if not isinstance(obj, dict): + errors.append( + f"Row {i}: metadata must be a JSON object, got {type(obj).__name__}" + ) + continue + valid_fields = {k: v for k, v in obj.items() if _is_valid_metadata_value(v)} + if obj and not valid_fields: + warnings.append( + f"Row {i}: metadata has no Pinecone-compatible fields " + "(values must be string, number, bool, or list of strings)" + ) + + +# --------------------------------------------------------------------------- +# File listing +# --------------------------------------------------------------------------- + + +def _list_parquet_files(uri: str) -> list[str]: + """Return all parquet file URIs under a path (handles single file or directory).""" + import pyarrow.fs as pafs + + if uri.lower().endswith(".parquet"): + return [uri] + + fs, root_path = pafs.FileSystem.from_uri(uri) + scheme = (uri.split("://")[0] + "://") if "://" in uri else "" + + root_path = root_path.rstrip("/") + selector = pafs.FileSelector(root_path, recursive=True) + file_infos = fs.get_file_info(selector) + + result = [] + for fi in file_infos: + if fi.type == pafs.FileType.File and fi.base_name.lower().endswith(".parquet"): + result.append(f"{scheme}{fi.path}" if scheme else fi.path) + + return sorted(result) + + +# --------------------------------------------------------------------------- +# Public entry point +# --------------------------------------------------------------------------- + + +def validate_bulk_import_uri( + uri: str, + dimension: int | None = None, + vector_type: str | None = None, + sample_rows: int = 100, + verbose: bool = False, +) -> BulkImportValidationResult: + """Validate parquet file(s) at *uri* for Pinecone bulk import compatibility. + + Schema validation reads only the parquet file footer — no vector data is + downloaded — making it cheap even for large remote files. When + ``sample_rows > 0`` a small number of rows are also read to check for null + IDs, non-finite values, and metadata correctness. + + Args: + uri: Local path or remote URI (``s3://``, ``gs://``, ``az://``). + May point to a single ``.parquet`` file or a directory/prefix + containing multiple files. + dimension: Expected vector dimension. When provided, any mismatch + between the file and this value is reported as an error. + vector_type: ``"dense"`` or ``"sparse"``. Inferred from the schema + when omitted. + sample_rows: Number of rows to read for data-level checks. Set to + ``0`` to perform schema-only validation without reading any data. + verbose: When ``True``, print per-file progress and a summary to stdout. + + Returns: + :class:`BulkImportValidationResult` — pass ``result.uri`` directly to + ``index.bulk_import.start()`` if ``result.is_valid`` is ``True``. 
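
    Examples:
        This function is also exposed at the package top level as
        ``pinecone.validate_bulk_import``. The path, bucket, and dimension
        below are placeholders:

        >>> from pinecone import validate_bulk_import
        >>> result = validate_bulk_import("./export/vectors.parquet", dimension=1536)
        >>> if not result.is_valid:
        ...     for error in result.errors:
        ...         print(error)

        >>> # Schema-only check of a remote prefix (reads only parquet footers)
        >>> result = validate_bulk_import("s3://my-bucket/vectors/", sample_rows=0)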
+ """ + _require_pyarrow() + import pyarrow.parquet as pq + + errors: list[str] = [] + warnings: list[str] = [] + files_checked = 0 + rows_sampled = 0 + + try: + parquet_files = _list_parquet_files(uri) + except Exception as e: + return BulkImportValidationResult( + is_valid=False, + uri=uri, + errors=[f"Failed to access '{uri}': {e}"], + ) + + if not parquet_files: + return BulkImportValidationResult( + is_valid=False, + uri=uri, + errors=[f"No parquet files found at '{uri}'"], + ) + + total = len(parquet_files) + multi = total > 1 + + if verbose: + print(f"Validating {total} file(s) at {uri} ...") + + ok_count = 0 + bad_count = 0 + + for file_uri in parquet_files: + file_errors: list[str] = [] + file_warnings: list[str] = [] + index = files_checked + 1 + + try: + schema = pq.read_schema(file_uri) + except Exception as e: + msg = f"failed to read parquet schema: {e}" + errors.append(f"{file_uri}: {msg}") + if verbose: + print(f"[{index:>{len(str(total))}}/{total}] BAD {file_uri}") + print(f" {msg}") + files_checked += 1 + bad_count += 1 + continue + + _validate_schema(schema, dimension, vector_type, file_errors, file_warnings) + + if sample_rows > 0 and not file_errors: + try: + columns = [c for c in _KNOWN_COLUMNS if c in schema.names] + pf = pq.ParquetFile(file_uri) + sample_table = None + for batch in pf.iter_batches(batch_size=sample_rows, columns=columns): + import pyarrow as pa + + sample_table = pa.Table.from_batches([batch]) + break + if sample_table is not None: + _validate_data_sample(sample_table, dimension, file_errors, file_warnings) + rows_sampled += len(sample_table) + except Exception as e: + file_warnings.append(f"Could not read sample data: {e}") + + if verbose: + status = "BAD " if file_errors else "OK " + print(f"[{index:>{len(str(total))}}/{total}] {status} {file_uri}") + for fe in file_errors: + print(f" error: {fe}") + for fw in file_warnings: + print(f" warning: {fw}") + + prefix = f"{file_uri}: " if multi else "" + errors.extend(f"{prefix}{e}" for e in file_errors) + warnings.extend(f"{prefix}{w}" for w in file_warnings) + files_checked += 1 + if file_errors: + bad_count += 1 + else: + ok_count += 1 + + if verbose: + print(f"\nTotal: {total} OK: {ok_count} BAD: {bad_count} rows sampled: {rows_sampled}") + + return BulkImportValidationResult( + is_valid=len(errors) == 0, + uri=uri, + errors=errors, + warnings=warnings, + files_checked=files_checked, + rows_sampled=rows_sampled, + ) diff --git a/pyproject.toml b/pyproject.toml index f5e06b3ec..06d3efae4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,9 @@ asyncio = [ "aiohttp>=3.9.0", "aiohttp-retry>=2.9.1,<3.0.0", ] +parquet = [ + "pyarrow>=14.0.0", +] types = [ "mypy>=1.6.1,<2.0.0", "types-urllib3>=1.26.25.14,<1.27.0.0", diff --git a/tests/unit/data/test_bulk_import_validator.py b/tests/unit/data/test_bulk_import_validator.py new file mode 100644 index 000000000..0634e15dd --- /dev/null +++ b/tests/unit/data/test_bulk_import_validator.py @@ -0,0 +1,466 @@ +"""Tests for bulk import parquet validation. + +Validation logic is adapted from the internal notebook +``bulk_import_parquet_validate.ipynb``. 
+""" + +import json +import math +import pytest + +pytest.importorskip("pyarrow", reason="pyarrow required for bulk import validation") + +import pyarrow as pa +import pyarrow.parquet as pq + +from pinecone.db_data.resources.sync.bulk_import_validator import ( + validate_bulk_import_uri, + _validate_schema, + _validate_data_sample, +) +from pinecone.db_data.dataclasses.bulk_import_validation_result import ( + BulkImportValidationResult, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def make_schema(fields: dict) -> pa.Schema: + """Build a pyarrow Schema from a {name: type} dict.""" + return pa.schema([pa.field(name, dtype) for name, dtype in fields.items()]) + + +def make_dense_schema(dimension: int = 4, float_type=pa.float32()) -> pa.Schema: + return make_schema( + { + "id": pa.string(), + "values": pa.list_(float_type), + } + ) + + +def make_fixed_dense_schema(dimension: int = 4) -> pa.Schema: + return make_schema( + { + "id": pa.string(), + "values": pa.list_(pa.field("item", pa.float32()), dimension), + } + ) + + +def make_sparse_struct_type() -> pa.StructType: + """The exact struct type Pinecone uses for sparse vectors in parquet.""" + return pa.struct([ + pa.field("indices", pa.list_(pa.uint32())), + pa.field("values", pa.list_(pa.float32())), + ]) + + +def make_sparse_schema() -> pa.Schema: + return make_schema( + { + "id": pa.string(), + "sparse_values": make_sparse_struct_type(), + } + ) + + +def make_table(rows: list[dict], schema: pa.Schema) -> pa.Table: + arrays = {} + for field in schema: + arrays[field.name] = pa.array( + [r.get(field.name) for r in rows], type=field.type + ) + return pa.table(arrays, schema=schema) + + +def make_dense_table( + n: int = 3, + dimension: int = 4, + float_type=pa.float32(), + bad_id: bool = False, + null_id: bool = False, + null_vector: bool = False, + non_finite: bool = False, +) -> pa.Table: + schema = make_schema({"id": pa.string(), "values": pa.list_(float_type)}) + ids = [None if (null_id and i == 0) else ("" if (bad_id and i == 0) else f"vec-{i}") for i in range(n)] + vectors = [] + for i in range(n): + if null_vector and i == 0: + vectors.append(None) + elif non_finite and i == 0: + vectors.append([float("inf")] + [float(j) for j in range(dimension - 1)]) + else: + vectors.append([float(j) for j in range(dimension)]) + return pa.table( + {"id": pa.array(ids, pa.string()), "values": pa.array(vectors, pa.list_(float_type))}, + ) + + +# --------------------------------------------------------------------------- +# Schema validation tests +# --------------------------------------------------------------------------- + + +class TestValidateSchema: + def test_valid_dense_schema(self): + errors, warnings = [], [] + _validate_schema(make_dense_schema(), None, None, errors, warnings) + assert errors == [] + + def test_missing_id(self): + schema = make_schema({"values": pa.list_(pa.float32())}) + errors, warnings = [], [] + _validate_schema(schema, None, None, errors, warnings) + assert any("'id'" in e for e in errors) + + def test_id_wrong_type(self): + schema = make_schema({"id": pa.int64(), "values": pa.list_(pa.float32())}) + errors, warnings = [], [] + _validate_schema(schema, None, None, errors, warnings) + assert any("'id'" in e and "string" in e for e in errors) + + def test_missing_values_for_dense(self): + schema = make_schema({"id": pa.string()}) + errors, warnings = [], [] + _validate_schema(schema, None, 
"dense", errors, warnings) + assert any("'values'" in e for e in errors) + + def test_values_wrong_type(self): + schema = make_schema({"id": pa.string(), "values": pa.list_(pa.string())}) + errors, warnings = [], [] + _validate_schema(schema, None, "dense", errors, warnings) + assert any("'values'" in e and "float" in e for e in errors) + + def test_values_float64_accepted(self): + schema = make_schema({"id": pa.string(), "values": pa.list_(pa.float64())}) + errors, warnings = [], [] + _validate_schema(schema, None, "dense", errors, warnings) + assert errors == [] + + def test_fixed_size_list_dimension_match(self): + schema = make_fixed_dense_schema(dimension=4) + errors, warnings = [], [] + _validate_schema(schema, 4, "dense", errors, warnings) + assert errors == [] + + def test_fixed_size_list_dimension_mismatch(self): + schema = make_fixed_dense_schema(dimension=4) + errors, warnings = [], [] + _validate_schema(schema, 8, "dense", errors, warnings) + assert any("dimension" in e for e in errors) + + def test_fixed_size_list_dimension_inferred_in_warning(self): + schema = make_fixed_dense_schema(dimension=4) + errors, warnings = [], [] + _validate_schema(schema, None, "dense", errors, warnings) + assert errors == [] + assert any("4" in w for w in warnings) + + def test_sparse_schema_valid(self): + errors, warnings = [], [] + _validate_schema(make_sparse_schema(), None, "sparse", errors, warnings) + assert errors == [] + + def test_sparse_missing_sparse_values(self): + schema = make_schema({"id": pa.string()}) + errors, warnings = [], [] + _validate_schema(schema, None, "sparse", errors, warnings) + assert any("sparse_values" in e for e in errors) + + def test_sparse_wrong_type_flat_list(self): + # Old (incorrect) two-column format should be caught + schema = make_schema( + { + "id": pa.string(), + "sparse_values": pa.list_(pa.float32()), + } + ) + errors, warnings = [], [] + _validate_schema(schema, None, "sparse", errors, warnings) + assert any("STRUCT" in e for e in errors) + + def test_sparse_struct_missing_indices_field(self): + bad_struct = pa.struct([pa.field("values", pa.list_(pa.float32()))]) + schema = make_schema({"id": pa.string(), "sparse_values": bad_struct}) + errors, warnings = [], [] + _validate_schema(schema, None, "sparse", errors, warnings) + assert any("STRUCT" in e for e in errors) + + def test_sparse_struct_wrong_indices_type(self): + bad_struct = pa.struct([ + pa.field("indices", pa.list_(pa.float32())), # should be integer + pa.field("values", pa.list_(pa.float32())), + ]) + schema = make_schema({"id": pa.string(), "sparse_values": bad_struct}) + errors, warnings = [], [] + _validate_schema(schema, None, "sparse", errors, warnings) + assert any("STRUCT" in e for e in errors) + + def test_no_vector_columns_detected(self): + schema = make_schema({"id": pa.string(), "category": pa.string()}) + errors, warnings = [], [] + _validate_schema(schema, None, None, errors, warnings) + assert any("No vector columns" in e for e in errors) + + def test_extra_columns_are_error(self): + # Docs: "No additional columns permitted" + schema = make_schema( + {"id": pa.string(), "values": pa.list_(pa.float32()), "source": pa.string()} + ) + errors, warnings = [], [] + _validate_schema(schema, None, "dense", errors, warnings) + assert any("source" in e for e in errors) + + def test_metadata_string_column_ok(self): + schema = make_schema( + { + "id": pa.string(), + "values": pa.list_(pa.float32()), + "metadata": pa.string(), + } + ) + errors, warnings = [], [] + _validate_schema(schema, 
None, "dense", errors, warnings) + assert errors == [] + + def test_metadata_struct_is_error(self): + # Docs only document JSON-encoded UTF-8 string for metadata + schema = make_schema( + { + "id": pa.string(), + "values": pa.list_(pa.float32()), + "metadata": pa.struct([pa.field("genre", pa.string())]), + } + ) + errors, warnings = [], [] + _validate_schema(schema, None, "dense", errors, warnings) + assert any("metadata" in e for e in errors) + + def test_metadata_wrong_type_is_error(self): + schema = make_schema( + { + "id": pa.string(), + "values": pa.list_(pa.float32()), + "metadata": pa.int64(), + } + ) + errors, warnings = [], [] + _validate_schema(schema, None, "dense", errors, warnings) + assert any("metadata" in e for e in errors) + + +# --------------------------------------------------------------------------- +# Data sample validation tests +# --------------------------------------------------------------------------- + + +class TestValidateDataSample: + def test_valid_dense_rows(self): + table = make_dense_table(n=5, dimension=4) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert errors == [] + + def test_null_id(self): + table = make_dense_table(n=3, null_id=True) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("null" in e.lower() and "ID" in e for e in errors) + + def test_empty_id(self): + table = make_dense_table(n=3, bad_id=True) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("empty" in e.lower() for e in errors) + + def test_null_vector(self): + table = make_dense_table(n=3, null_vector=True) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("null" in e.lower() and "vector" in e.lower() for e in errors) + + def test_dimension_mismatch(self): + table = make_dense_table(n=3, dimension=4) + errors, warnings = [], [] + _validate_data_sample(table, 8, errors, warnings) + assert any("dimension" in e.lower() or "length" in e.lower() for e in errors) + + def test_non_finite_value(self): + table = make_dense_table(n=3, dimension=4, non_finite=True) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("finite" in e.lower() or "Inf" in e for e in errors) + + def test_nan_value(self): + schema = make_schema({"id": pa.string(), "values": pa.list_(pa.float32())}) + table = pa.table( + { + "id": pa.array(["a", "b"], pa.string()), + "values": pa.array( + [[float("nan"), 1.0, 2.0, 3.0], [0.0, 1.0, 2.0, 3.0]], + pa.list_(pa.float32()), + ), + } + ) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("finite" in e.lower() or "NaN" in e for e in errors) + + def test_valid_metadata(self): + meta = json.dumps({"genre": "fiction", "year": 2024, "tags": ["a", "b"]}) + table = pa.table( + { + "id": pa.array(["a"], pa.string()), + "values": pa.array([[1.0, 2.0, 3.0, 4.0]], pa.list_(pa.float32())), + "metadata": pa.array([meta], pa.string()), + } + ) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert errors == [] + + def test_metadata_too_large(self): + big_meta = json.dumps({"key": "x" * (41 * 1024)}) + table = pa.table( + { + "id": pa.array(["a"], pa.string()), + "values": pa.array([[1.0, 2.0, 3.0, 4.0]], pa.list_(pa.float32())), + "metadata": pa.array([big_meta], pa.string()), + } + ) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("40 KB" in e or "limit" in e for 
e in errors) + + def test_metadata_invalid_json(self): + table = pa.table( + { + "id": pa.array(["a"], pa.string()), + "values": pa.array([[1.0, 2.0, 3.0, 4.0]], pa.list_(pa.float32())), + "metadata": pa.array(["not json"], pa.string()), + } + ) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("JSON" in e for e in errors) + + def test_metadata_not_dict(self): + table = pa.table( + { + "id": pa.array(["a"], pa.string()), + "values": pa.array([[1.0, 2.0, 3.0, 4.0]], pa.list_(pa.float32())), + "metadata": pa.array([json.dumps([1, 2, 3])], pa.string()), + } + ) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert any("object" in e.lower() or "dict" in e.lower() for e in errors) + + def test_metadata_no_compatible_fields_warning(self): + table = pa.table( + { + "id": pa.array(["a"], pa.string()), + "values": pa.array([[1.0, 2.0, 3.0, 4.0]], pa.list_(pa.float32())), + "metadata": pa.array([json.dumps({"key": [1, 2, 3]})], pa.string()), + } + ) + errors, warnings = [], [] + _validate_data_sample(table, 4, errors, warnings) + assert errors == [] + assert any("compatible" in w.lower() for w in warnings) + + +# --------------------------------------------------------------------------- +# End-to-end: validate_bulk_import_uri with real parquet files on disk +# --------------------------------------------------------------------------- + + +class TestValidateBulkImportUri: + def test_valid_single_file(self, tmp_path): + table = make_dense_table(n=10, dimension=4) + path = str(tmp_path / "vectors.parquet") + pq.write_table(table, path) + + result = validate_bulk_import_uri(path, dimension=4) + assert result.is_valid + assert result.files_checked == 1 + assert result.rows_sampled == 10 + + def test_dimension_mismatch_end_to_end(self, tmp_path): + table = make_dense_table(n=5, dimension=4) + path = str(tmp_path / "vectors.parquet") + pq.write_table(table, path) + + result = validate_bulk_import_uri(path, dimension=8) + assert not result.is_valid + assert any("dimension" in e.lower() or "length" in e.lower() for e in result.errors) + + def test_schema_only_no_rows_sampled(self, tmp_path): + table = make_dense_table(n=10, dimension=4) + path = str(tmp_path / "vectors.parquet") + pq.write_table(table, path) + + result = validate_bulk_import_uri(path, dimension=4, sample_rows=0) + assert result.is_valid + assert result.rows_sampled == 0 + + def test_directory_with_multiple_files(self, tmp_path): + for i in range(3): + table = make_dense_table(n=5, dimension=4) + pq.write_table(table, str(tmp_path / f"part-{i}.parquet")) + + result = validate_bulk_import_uri(str(tmp_path), dimension=4) + assert result.is_valid + assert result.files_checked == 3 + + def test_empty_directory(self, tmp_path): + result = validate_bulk_import_uri(str(tmp_path)) + assert not result.is_valid + assert any("No parquet files" in e for e in result.errors) + + def test_missing_id_column_end_to_end(self, tmp_path): + table = pa.table({"values": pa.array([[1.0, 2.0], [3.0, 4.0]], pa.list_(pa.float32()))}) + path = str(tmp_path / "bad.parquet") + pq.write_table(table, path) + + result = validate_bulk_import_uri(path) + assert not result.is_valid + assert any("'id'" in e for e in result.errors) + + def test_result_repr_invalid(self, tmp_path): + table = pa.table({"values": pa.array([[1.0, 2.0]], pa.list_(pa.float32()))}) + path = str(tmp_path / "bad.parquet") + pq.write_table(table, path) + + result = validate_bulk_import_uri(path) + r = repr(result) + assert 
"INVALID" in r + assert "'id'" in r + + def test_result_repr_valid(self, tmp_path): + table = make_dense_table(n=2, dimension=4) + path = str(tmp_path / "ok.parquet") + pq.write_table(table, path) + + result = validate_bulk_import_uri(path, dimension=4) + assert "VALID" in repr(result) + + def test_pyarrow_not_installed(self, monkeypatch): + import builtins + + real_import = builtins.__import__ + + def mock_import(name, *args, **kwargs): + if name.startswith("pyarrow"): + raise ImportError("No module named 'pyarrow'") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", mock_import) + + with pytest.raises(ImportError, match="pinecone\\[parquet\\]"): + validate_bulk_import_uri("/some/path.parquet") diff --git a/uv.lock b/uv.lock index 8fc67c1c5..9cb3ce7cc 100644 --- a/uv.lock +++ b/uv.lock @@ -1580,7 +1580,7 @@ wheels = [ [[package]] name = "pinecone" -version = "8.1.0" +version = "8.1.2" source = { editable = "." } dependencies = [ { name = "certifi" }, @@ -1630,6 +1630,9 @@ grpc = [ { name = "protobuf" }, { name = "protoc-gen-openapiv2" }, ] +parquet = [ + { name = "pyarrow" }, +] types = [ { name = "grpc-stubs" }, { name = "mypy" }, @@ -1664,6 +1667,7 @@ requires-dist = [ { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=3.0.0,<4.0.0" }, { name = "protobuf", marker = "extra == 'grpc'", specifier = ">=6.33.0,<7.0.0" }, { name = "protoc-gen-openapiv2", marker = "extra == 'grpc'", specifier = ">=0.0.1,<0.1.0" }, + { name = "pyarrow", marker = "extra == 'parquet'", specifier = ">=14.0.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = "==8.2.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.25.2,<0.26.0" }, { name = "pytest-benchmark", marker = "python_full_version < '4' and extra == 'dev'", specifier = "==5.0.0" }, @@ -1688,7 +1692,7 @@ requires-dist = [ { name = "urllib3-mock", marker = "extra == 'dev'", specifier = "==0.3.3" }, { name = "vprof", marker = "extra == 'dev'", specifier = ">=0.38,<0.39" }, ] -provides-extras = ["grpc", "asyncio", "types", "dev"] +provides-extras = ["grpc", "asyncio", "parquet", "types", "dev"] [[package]] name = "pinecone-plugin-assistant" @@ -1923,6 +1927,63 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5", size = 22335, upload-time = "2022-10-25T20:38:27.636Z" }, ] +[[package]] +name = "pyarrow" +version = "24.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a5/bf/a34fee1d624152124fa8355c42f34195ad5fe5233ce5bb87946432047d52/pyarrow-24.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:7c2b98645d576a0b9616892ead22b64a83a5f043c5e2ca15ebcefcb5b70c80cb", size = 35076681, upload-time = "2026-04-21T08:51:46.845Z" }, + { url = "https://files.pythonhosted.org/packages/1d/41/64180033d7027afce12dc96d0fe1f504c6fa112190582b458acea2399530/pyarrow-24.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:644a246325b8c69c595ad1dd4b463eba4b0cdb731370e4a86137d433208d6147", size = 36684260, upload-time = "2026-04-21T08:51:53.642Z" 
}, + { url = "https://files.pythonhosted.org/packages/57/02/9b9320e673dd8a99411fac78690f3df92f6dd6f59754c750110bca66d64e/pyarrow-24.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:3a577bd840ca83f646f0a625dbc571dba7044c43c2d1503afc378b570954345c", size = 45698566, upload-time = "2026-04-21T10:46:02.133Z" }, + { url = "https://files.pythonhosted.org/packages/67/33/f75e91b9a64c3f33c787e263c93b871ad91b8a4a68c1d5cebddd9840e835/pyarrow-24.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:e3268e43984d0b1a185c89b4cfff282a7ead12fc93f56cfd7088bdbcbe727041", size = 48835562, upload-time = "2026-04-21T10:46:10.278Z" }, + { url = "https://files.pythonhosted.org/packages/a5/63/097510448e47e4091faa41c43ba92f97cecaab8f4535b56a3d149578f634/pyarrow-24.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2392d954fcb920f42d230284b677605e4e2fbb11f2821e823e642abd67fbb491", size = 49394997, upload-time = "2026-04-21T10:46:18.08Z" }, + { url = "https://files.pythonhosted.org/packages/60/6b/c047d6222ab279024a062742d1807e2fbaf27bba88a98637299ff47b9236/pyarrow-24.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bec9373df11544592b0ba7ec2af0e35059e5f0e7647c6183a854dedd193298f1", size = 51911424, upload-time = "2026-04-21T10:46:25.347Z" }, + { url = "https://files.pythonhosted.org/packages/3a/ba/464cc70761c2a525d97ebd84e21c31ebd47f3ef4bdcee117009f51c46f24/pyarrow-24.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:c42ab9439498270139cc63e18847a02afe5c8b3ed9c931266533cfe378bd3591", size = 27251730, upload-time = "2026-04-21T10:46:30.913Z" }, + { url = "https://files.pythonhosted.org/packages/62/c9/a47ab7ece0d86cbe6678418a0fbd1ac4bb493b9184a3891dfa0e7f287ae0/pyarrow-24.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b0e131f880cda8d04e076cee175a46fc0e8bc8b65c99c6c09dff6669335fde74", size = 35068898, upload-time = "2026-04-21T10:46:36.599Z" }, + { url = "https://files.pythonhosted.org/packages/d1/bc/8db86617a9a58008acf8913d6fed68ea2a46acb6de928db28d724c891a68/pyarrow-24.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:1b2fe7f9a5566401a0ef2571f197eb92358925c1f0c8dba305d6e43ea0871bb3", size = 36679915, upload-time = "2026-04-21T10:46:42.602Z" }, + { url = "https://files.pythonhosted.org/packages/eb/8e/fb178720400ef69db251eb4a9c3ccf4af269bc1feb5055529b8fc87170d1/pyarrow-24.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:0b3537c00fb8d384f15ac1e79b6eb6db04a16514c8c1d22e59a9b95c8ba42868", size = 45697931, upload-time = "2026-04-21T10:46:48.403Z" }, + { url = "https://files.pythonhosted.org/packages/f3/27/99c42abe8e21b44f4917f62631f3aa31404882a2c41d8a4cd5c110e13d52/pyarrow-24.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:14e31a3c9e35f1ab6356c6378f6f72830e6d2d5f1791df3774a7b097d18a6a1e", size = 48837449, upload-time = "2026-04-21T10:46:55.329Z" }, + { url = "https://files.pythonhosted.org/packages/36/b6/333749e2666e9032891125bf9c691146e92901bece62030ac1430e2e7c88/pyarrow-24.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b7d9a514e73bc42711e6a35aaccf3587c520024fe0a25d830a1a8a27c15f4f57", size = 49395949, upload-time = "2026-04-21T10:47:01.869Z" }, + { url = "https://files.pythonhosted.org/packages/17/25/c5201706a2dd374e8ba6ee3fd7a8c89fb7ffc16eed5217a91fd2bd7f7626/pyarrow-24.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b196eb3f931862af3fa84c2a253514d859c08e0d8fe020e07be12e75a5a9780c", size = 51912986, upload-time = "2026-04-21T10:47:09.872Z" }, + { url = 
"https://files.pythonhosted.org/packages/f8/d2/4d1bbba65320b21a49678d6fbdc6ff7c649251359fdcfc03568c4136231d/pyarrow-24.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:35405aecb474e683fb36af650618fd5340ee5471fc65a21b36076a18bbc6c981", size = 27255371, upload-time = "2026-04-21T10:47:15.943Z" }, + { url = "https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559, upload-time = "2026-04-21T10:47:22.17Z" }, + { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654, upload-time = "2026-04-21T10:47:28.315Z" }, + { url = "https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394, upload-time = "2026-04-21T10:47:34.821Z" }, + { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122, upload-time = "2026-04-21T10:47:42.056Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032, upload-time = "2026-04-21T10:47:48.967Z" }, + { url = "https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490, upload-time = "2026-04-21T10:47:55.981Z" }, + { url = "https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660, upload-time = "2026-04-21T10:48:01.718Z" }, + { url = "https://files.pythonhosted.org/packages/6f/d3/a1abf004482026ddc17f4503db227787fa3cfe41ec5091ff20e4fea55e57/pyarrow-24.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:02b001b3ed4723caa44f6cd1af2d5c86aa2cf9971dacc2ffa55b21237713dfba", size = 34976759, upload-time = "2026-04-21T10:48:07.258Z" }, + { url = "https://files.pythonhosted.org/packages/4f/4a/34f0a36d28a2dd32225301b79daad44e243dc1a2bb77d43b60749be255c4/pyarrow-24.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:04920d6a71aabd08a0417709efce97d45ea8e6fb733d9ca9ecffb13c67839f68", size = 36658471, upload-time = "2026-04-21T10:48:13.347Z" }, + { url = "https://files.pythonhosted.org/packages/1f/78/543b94712ae8bb1a6023bcc1acf1a740fbff8286747c289cd9468fced2a5/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a964266397740257f16f7bb2e4f08a0c81454004beab8ff59dd531b73610e9f2", size = 45675981, upload-time = "2026-04-21T10:48:20.201Z" }, + { url = 
"https://files.pythonhosted.org/packages/84/9f/8fb7c222b100d314137fa40ec050de56cd8c6d957d1cfff685ce72f15b17/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6f066b179d68c413374294bc1735f68475457c933258df594443bb9d88ddc2a0", size = 48859172, upload-time = "2026-04-21T10:48:27.541Z" }, + { url = "https://files.pythonhosted.org/packages/a7/d3/1ea72538e6c8b3b475ed78d1049a2c518e655761ea50fe1171fc855fcab7/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1183baeb14c5f587b1ec52831e665718ce632caab84b7cd6b85fd44f96114495", size = 49385733, upload-time = "2026-04-21T10:48:34.7Z" }, + { url = "https://files.pythonhosted.org/packages/c3/be/c3d8b06a1ba35f2260f8e1f771abbee7d5e345c0937aab90675706b1690a/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:806f24b4085453c197a5078218d1ee08783ebbba271badd153d1ae22a3ee804f", size = 51934335, upload-time = "2026-04-21T10:48:42.099Z" }, + { url = "https://files.pythonhosted.org/packages/9c/62/89e07a1e7329d2cde3e3c6994ba0839a24977a2beda8be6005ea3d860b99/pyarrow-24.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e4505fc6583f7b05ab854934896bcac8253b04ac1171a77dfb73efef92076d91", size = 27271748, upload-time = "2026-04-21T10:49:42.532Z" }, + { url = "https://files.pythonhosted.org/packages/17/1a/cff3a59f80b5b1658549d46611b67163f65e0664431c076ad728bf9d5af4/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:1a4e45017efbf115032e4475ee876d525e0e36c742214fbe405332480ecd6275", size = 35238554, upload-time = "2026-04-21T10:48:48.526Z" }, + { url = "https://files.pythonhosted.org/packages/a8/99/cce0f42a327bfef2c420fb6078a3eb834826e5d6697bf3009fe11d2ad051/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:7986f1fa71cee060ad00758bcc79d3a93bab8559bf978fab9e53472a2e25a17b", size = 36782301, upload-time = "2026-04-21T10:48:55.181Z" }, + { url = "https://files.pythonhosted.org/packages/2a/66/8e560d5ff6793ca29aca213c53eec0dd482dd46cb93b2819e5aab52e4252/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:d3e0b61e8efb24ed38898e5cdc5fffa9124be480008d401a1f8071500494ae42", size = 45721929, upload-time = "2026-04-21T10:49:03.676Z" }, + { url = "https://files.pythonhosted.org/packages/27/0c/a26e25505d030716e078d9f16eb74973cbf0b33b672884e9f9da1c83b871/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:55a3bc1e3df3b5567b7d27ef551b2283f0c68a5e86f1cd56abc569da4f31335b", size = 48825365, upload-time = "2026-04-21T10:49:11.714Z" }, + { url = "https://files.pythonhosted.org/packages/5f/eb/771f9ecb0c65e73fe9dccdd1717901b9594f08c4515d000c7c62df573811/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:641f795b361874ac9da5294f8f443dfdbee355cf2bd9e3b8d97aaac2306b9b37", size = 49451819, upload-time = "2026-04-21T10:49:21.474Z" }, + { url = "https://files.pythonhosted.org/packages/48/da/61ae89a88732f5a785646f3ec6125dbb640fa98a540eb2b9889caa561403/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8adc8e6ce5fccf5dc707046ae4914fd537def529709cc0d285d37a7f9cd442ca", size = 51909252, upload-time = "2026-04-21T10:49:31.164Z" }, + { url = "https://files.pythonhosted.org/packages/cb/1a/8dd5cafab7b66573fa91c03d06d213356ad4edd71813aa75e08ce2b3a844/pyarrow-24.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:9b18371ad2f44044b81a8d23bc2d8a9b6a6226dca775e8e16cfee640473d6c5d", size = 27388127, upload-time = "2026-04-21T10:49:37.334Z" }, + { url = 
"https://files.pythonhosted.org/packages/ad/80/d022a34ff05d2cbedd8ccf841fc1f532ecfa9eb5ed1711b56d0e0ea71fc9/pyarrow-24.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:1cc9057f0319e26333b357e17f3c2c022f1a83739b48a88b25bfd5fa2dc18838", size = 35007997, upload-time = "2026-04-21T10:49:48.796Z" }, + { url = "https://files.pythonhosted.org/packages/1a/ff/f01485fda6f4e5d441afb8dd5e7681e4db18826c1e271852f5d3957d6a80/pyarrow-24.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e6f1278ee4785b6db21229374a1c9e54ec7c549de5d1efc9630b6207de7e170b", size = 36678720, upload-time = "2026-04-21T10:49:55.858Z" }, + { url = "https://files.pythonhosted.org/packages/9e/c2/2d2d5fea814237923f71b36495211f20b43a1576f9a4d6da7e751a64ec6f/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:adbbedc55506cbdabb830890444fb856bfb0060c46c6f8026c6c2f2cf86ae795", size = 45741852, upload-time = "2026-04-21T10:50:04.624Z" }, + { url = "https://files.pythonhosted.org/packages/8e/3a/28ba9c1c1ebdbb5f1b94dfebb46f207e52e6a554b7fe4132540fde29a3a0/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ae8a1145af31d903fa9bb166824d7abe9b4681a000b0159c9fb99c11bc11ad26", size = 48889852, upload-time = "2026-04-21T10:50:12.293Z" }, + { url = "https://files.pythonhosted.org/packages/df/51/4a389acfd31dca009f8fb82d7f510bb4130f2b3a8e18cf00194d0687d8ac/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d7027eba1df3b2069e2e8d80f644fa0918b68c46432af3d088ddd390d063ecde", size = 49445207, upload-time = "2026-04-21T10:50:20.677Z" }, + { url = "https://files.pythonhosted.org/packages/19/4b/0bab2b23d2ae901b1b9a03c0efd4b2d070256f8ce3fc43f6e58c167b2081/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e56a1ffe9bf7b727432b89104cc0849c21582949dd7bdcb34f17b2001a351a76", size = 51954117, upload-time = "2026-04-21T10:50:29.14Z" }, + { url = "https://files.pythonhosted.org/packages/29/88/f4e9145da0417b3d2c12035a8492b35ff4a3dbc653e614fcfb51d9dedb38/pyarrow-24.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:38be1808cdd068605b787e6ca9119b27eb275a0234e50212c3492331680c3b1e", size = 28001155, upload-time = "2026-04-21T10:51:22.337Z" }, + { url = "https://files.pythonhosted.org/packages/79/4f/46a49a63f43526da895b1a45bbb51d5baf8e4d77159f8528fc3e5490007f/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:418e48ce50a45a6a6c73c454677203a9c75c966cb1e92ca3370959185f197a05", size = 35250387, upload-time = "2026-04-21T10:50:35.552Z" }, + { url = "https://files.pythonhosted.org/packages/a0/da/d5e0cd5ef00796922404806d5f00325cdadc3441ce2c13fe7115f2df9a64/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:2f16197705a230a78270cdd4ea8a1d57e86b2fdcbc34a1f6aebc72e65c986f9a", size = 36797102, upload-time = "2026-04-21T10:50:42.417Z" }, + { url = "https://files.pythonhosted.org/packages/34/c7/5904145b0a593a05236c882933d439b5720f0a145381179063722fbfc123/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:fb24ac194bfc5e86839d7dcd52092ee31e5fe6733fe11f5e3b06ef0812b20072", size = 45745118, upload-time = "2026-04-21T10:50:49.324Z" }, + { url = "https://files.pythonhosted.org/packages/13/d3/cca42fe166d1c6e4d5b80e530b7949104d10e17508a90ae202dac205ce2a/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:9700ebd9a51f5895ce75ff4ac4b3c47a7d4b42bc618be8e713e5d56bacf5f931", size = 48844765, upload-time = "2026-04-21T10:50:55.579Z" }, + { url = 
"https://files.pythonhosted.org/packages/b0/49/942c3b79878ba928324d1e17c274ed84581db8c0a749b24bcf4cbdf15bd3/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d8ddd2768da81d3ee08cfea9b597f4abb4e8e1dc8ae7e204b608d23a0d3ab699", size = 49471890, upload-time = "2026-04-21T10:51:02.439Z" }, + { url = "https://files.pythonhosted.org/packages/76/97/ff71431000a75d84135a1ace5ca4ba11726a231a8007bbb320a4c54075d5/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:61a3d7eaa97a14768b542f3d284dc6400dd2470d9f080708b13cd46b6ae18136", size = 51932250, upload-time = "2026-04-21T10:51:10.576Z" }, + { url = "https://files.pythonhosted.org/packages/51/be/6f79d55816d5c22557cf27533543d5d70dfe692adfbee4b99f2760674f38/pyarrow-24.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:c91d00057f23b8d353039520dc3a6c09d8608164c692e9f59a175a42b2ae0c19", size = 28131282, upload-time = "2026-04-21T10:51:16.815Z" }, +] + [[package]] name = "pygments" version = "2.20.0" From d125b19b374cf0ba303f04f3f0e245b978e4f5c4 Mon Sep 17 00:00:00 2001 From: Tim Condello Date: Wed, 22 Apr 2026 12:17:47 -0400 Subject: [PATCH 2/3] fix(bulk-import): remove sparse_indices from unexpected-column error message sparse_indices is a sub-field of the sparse_values struct, not a valid top-level column. Listing it as allowed in the error message contradicted the validation logic and would confuse users trying to fix their files. Co-Authored-By: Claude Sonnet 4.6 --- pinecone/db_data/resources/sync/bulk_import_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pinecone/db_data/resources/sync/bulk_import_validator.py b/pinecone/db_data/resources/sync/bulk_import_validator.py index 07f054738..27cc6cfbf 100644 --- a/pinecone/db_data/resources/sync/bulk_import_validator.py +++ b/pinecone/db_data/resources/sync/bulk_import_validator.py @@ -162,7 +162,7 @@ def _validate_schema( if extra: errors.append( f"Unexpected column(s) {sorted(extra)} — no additional columns are permitted. " - "Only 'id', 'values', 'sparse_values', 'sparse_indices', and 'metadata' are allowed." + "Only 'id', 'values', 'sparse_values', and 'metadata' are allowed." 
         )

From 322b522468e4157977b521ad97761d58a44d4aab Mon Sep 17 00:00:00 2001
From: Tim Condello
Date: Wed, 22 Apr 2026 13:45:06 -0400
Subject: [PATCH 3/3] fix(bulk-import): address PR review feedback and fix CI failures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add `continue` after metadata size error to prevent double-reporting the same row
- Apply consistent error prefix (respects multi-file flag) to schema-read failures
- Remove quoted return-type annotations on validate() — class is imported at module level
- Add BulkImportValidationResult and validate_bulk_import to __init__.pyi __all__
- Use explicit re-export pattern (import X as X) in __init__.pyi to satisfy ruff F401
- Remove unused TYPE_CHECKING import of pyarrow.parquet in bulk_import_validator.py
- Remove unused imports and variables in test_bulk_import_validator.py (ruff F841/F401)
- Add mypy overrides for pyarrow optional dependency to silence import-not-found errors

Co-Authored-By: Claude Sonnet 4.6
---
 CLAUDE.md                                      | 69 -------------------
 pinecone/__init__.pyi                          |  4 +-
 .../resources/asyncio/bulk_import_asyncio.py   |  2 +-
 .../db_data/resources/sync/bulk_import.py      |  2 +-
 .../resources/sync/bulk_import_validator.py    |  5 +-
 pyproject.toml                                 |  7 ++
 tests/unit/data/test_bulk_import_validator.py  |  6 --
 7 files changed, 15 insertions(+), 80 deletions(-)
 delete mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
deleted file mode 100644
index f376d906a..000000000
--- a/CLAUDE.md
+++ /dev/null
@@ -1,69 +0,0 @@
-# CLAUDE.md
-
-This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-
-## Development Setup
-
-```sh
-uv sync --extra grpc --extra asyncio # install all dependencies
-uv run pre-commit install # enable lint/format checks on commit
-```
-
-## Key Commands
-
-```sh
-make test-unit # run unit + grpc unit tests
-uv run pytest tests/unit # REST unit tests only
-uv run pytest tests/unit_grpc # gRPC unit tests only
-uv run pytest tests/unit/path/to/test_file.py::ClassName::test_method # single test
-
-uv run mypy pinecone # type-check (excludes pinecone/core/)
-uv run ruff check --fix # lint
-uv run ruff format # format
-
-uv run repl # interactive REPL with pre-loaded Pinecone client
-
-make generate-oas # regenerate pinecone/core/openapi/ from OpenAPI specs
-```
-
-Integration tests make live Pinecone API calls and incur cost — only Pinecone employees should run them. Set credentials in `.env` (see `.env.example`) before running.
-
-## Architecture
-
-### Layer Overview
-
-```
-Pinecone / PineconeAsyncio ← public entry point (pinecone/pinecone.py, pinecone_asyncio.py)
- ├── DBControl ← index/collection/backup management (pinecone/db_control/)
- ├── DBData / Index ← vector upsert/query/fetch/delete (pinecone/db_data/)
- └── Inference ← embedding and reranking models (pinecone/inference/)
-```
-
-`Pinecone` and `PineconeAsyncio` are thin facades. Each delegates to `DBControl` (control-plane operations) and returns `Index` / `IndexAsyncio` objects (data-plane operations). Inference is accessible via `pc.inference`.
-
-### Generated Code — Never Edit Manually
-
-`pinecone/core/openapi/` is fully generated from OpenAPI specs via `make generate-oas` (which runs `codegen/build-oas.sh`). The script calls the openapi-generator Docker image, applies several post-processing fixes (underscore field name normalization, datetime coercion removal, shared-class deduplication), then runs `ruff format`. **Do not hand-edit files in `pinecone/core/`.**
-
-Shared OpenAPI utilities (ApiClient, exceptions, model_utils, etc.) live in `pinecone/openapi_support/` rather than being duplicated across the five generated modules (`db_control`, `db_data`, `inference`, `oauth`, `admin`).
-
-### Adapter Layer
-
-`pinecone/adapters/` converts generated OpenAPI response objects into clean SDK dataclasses. This isolates the rest of the SDK from generated-model churn. When a new response type is needed, add it here rather than parsing OpenAPI objects in index.py or other business logic files.
-
-### Sync / Async Split
-
-Every stateful class has a sync and an async variant:
-- `DBControl` / `DBControlAsyncio`
-- `Index` (in `db_data/index.py`) / `IndexAsyncio` (in `db_data/index_asyncio.py`)
-- `Inference` / `AsyncioInference`
-
-The async variants use `aiohttp` (optional extra). The sync variants use `urllib3`. gRPC is a third transport option installed via the `grpc` extra; data-plane integration tests can be toggled to gRPC with `USE_GRPC=true`.
-
-### Lazy Imports
-
-`pinecone/__init__.py` defers most imports through `utils/lazy_imports.py` to keep module startup time fast. When adding new public symbols, register them in the lazy import maps in `__init__.py` rather than adding top-level imports. The `.pyi` stub (`__init__.pyi`) is the authoritative type-visible public API surface and must be kept in sync.
-
-### Testing Philosophy
-
-Unit tests are intentionally sparse — they cover data conversion edge cases (e.g. `VectorFactory`, `QueryResultsAggregator`) but not every method. Most confidence comes from integration tests. When writing unit tests, check `tests/unit/db_data/` for patterns. Fixtures and index setup/teardown for integration tests live in `conftest.py` files at each directory level.
diff --git a/pinecone/__init__.pyi b/pinecone/__init__.pyi
index 10e3e2974..97fa534d2 100644
--- a/pinecone/__init__.pyi
+++ b/pinecone/__init__.pyi
@@ -50,7 +50,7 @@ from pinecone.db_data.models import (
 )
 from pinecone.core.openapi.db_data.models import NamespaceDescription
 from pinecone.db_data.resources.sync.bulk_import import ImportErrorMode
-from pinecone.db_data.dataclasses.bulk_import_validation_result import BulkImportValidationResult
+from pinecone.db_data.dataclasses.bulk_import_validation_result import BulkImportValidationResult as BulkImportValidationResult
 from pinecone.db_data.resources.sync.bulk_import_validator import validate_bulk_import_uri as validate_bulk_import
 from pinecone.db_data.errors import (
@@ -184,6 +184,8 @@ __all__ = [
     "UpdateRequest",
     "NamespaceDescription",
     "ImportErrorMode",
+    "BulkImportValidationResult",
+    "validate_bulk_import",
     "FilterBuilder",
     # Error classes
     "VectorDictionaryMissingKeysError",
diff --git a/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py b/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py
index 31e5f9f96..817e0ff3b 100644
--- a/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py
+++ b/pinecone/db_data/resources/asyncio/bulk_import_asyncio.py
@@ -162,7 +162,7 @@ def validate(
         vector_type: Literal["dense", "sparse"] | None = None,
         sample_rows: int = 100,
         verbose: bool = False,
-    ) -> "BulkImportValidationResult":
+    ) -> BulkImportValidationResult:
         """Validate parquet file(s) for Pinecone bulk import compatibility.

         This method is synchronous; pyarrow does not support async file I/O.
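The two `validate()` hunks above and below only touch the return annotation. For orientation, the call pattern this series adds looks roughly like the sketch that follows. The index name and URI are placeholders, and the argument and attribute names (`sample_rows`, `verbose`, `.is_valid`, `.errors`, `.uri`) are taken from the commit messages and diffs in this series, so treat the exact signatures as assumptions rather than a published reference:

```python
from pinecone import Pinecone, validate_bulk_import

pc = Pinecone(api_key="YOUR_API_KEY")   # placeholder credentials
index = pc.Index("example-index")       # hypothetical index name

# Footer-only schema check by default; sample_rows also inspects row data.
result = index.bulk_import.validate(
    "s3://example-bucket/vectors/",     # hypothetical URI
    sample_rows=100,
    verbose=True,
)

if result.is_valid:
    # The series description says result.uri can feed the import call;
    # the keyword name used here is an assumption.
    index.bulk_import.start(uri=result.uri)
else:
    for err in result.errors:
        print(err)

# Top-level helper for validating a URI without constructing an Index first.
standalone = validate_bulk_import("s3://example-bucket/vectors/")
```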
diff --git a/pinecone/db_data/resources/sync/bulk_import.py b/pinecone/db_data/resources/sync/bulk_import.py
index 8047fe9fe..483a3c542 100644
--- a/pinecone/db_data/resources/sync/bulk_import.py
+++ b/pinecone/db_data/resources/sync/bulk_import.py
@@ -169,7 +169,7 @@ def validate(
         vector_type: Literal["dense", "sparse"] | None = None,
         sample_rows: int = 100,
         verbose: bool = False,
-    ) -> "BulkImportValidationResult":
+    ) -> BulkImportValidationResult:
         """Validate parquet file(s) for Pinecone bulk import compatibility.

         Reads only the parquet file footer (schema metadata) by default, making
diff --git a/pinecone/db_data/resources/sync/bulk_import_validator.py b/pinecone/db_data/resources/sync/bulk_import_validator.py
index 27cc6cfbf..d16c1a0ec 100644
--- a/pinecone/db_data/resources/sync/bulk_import_validator.py
+++ b/pinecone/db_data/resources/sync/bulk_import_validator.py
@@ -10,7 +10,6 @@
 if TYPE_CHECKING:
     import pyarrow as pa
-    import pyarrow.parquet as pq


 # Matches Pinecone's documented metadata size limit.
 _MAX_METADATA_BYTES = 40 * 1024
@@ -231,6 +230,7 @@ def _validate_data_sample(
             errors.append(
                 f"Row {i}: metadata size {size} bytes exceeds the 40 KB limit"
             )
+            continue
         try:
             obj = json.loads(raw)
         except json.JSONDecodeError as e:
@@ -353,7 +353,8 @@ def validate_bulk_import_uri(
             schema = pq.read_schema(file_uri)
         except Exception as e:
             msg = f"failed to read parquet schema: {e}"
-            errors.append(f"{file_uri}: {msg}")
+            prefix = f"{file_uri}: " if multi else ""
+            errors.append(f"{prefix}{msg}")
             if verbose:
                 print(f"[{index:>{len(str(total))}}/{total}] BAD {file_uri}")
                 print(f" {msg}")
diff --git a/pyproject.toml b/pyproject.toml
index 06d3efae4..1cb278c03 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -149,6 +149,13 @@ line-ending = "auto"
 docstring-code-format = false
 docstring-code-line-length = "dynamic"

+[tool.mypy]
+ignore_missing_imports = false
+
+[[tool.mypy.overrides]]
+module = ["pyarrow", "pyarrow.*"]
+ignore_missing_imports = true
+
 [tool.ruff.lint.per-file-ignores]
 # F403 Allow star imports
 "__init__.py" = ["F403", "F405"]
diff --git a/tests/unit/data/test_bulk_import_validator.py b/tests/unit/data/test_bulk_import_validator.py
index 0634e15dd..cfd4250b3 100644
--- a/tests/unit/data/test_bulk_import_validator.py
+++ b/tests/unit/data/test_bulk_import_validator.py
@@ -5,7 +5,6 @@
 """

 import json
-import math
 import pytest

 pytest.importorskip("pyarrow", reason="pyarrow required for bulk import validation")
@@ -18,9 +17,6 @@
     _validate_schema,
     _validate_data_sample,
 )
-from pinecone.db_data.dataclasses.bulk_import_validation_result import (
-    BulkImportValidationResult,
-)


 # ---------------------------------------------------------------------------
@@ -86,7 +82,6 @@ def make_dense_table(
     null_vector: bool = False,
     non_finite: bool = False,
 ) -> pa.Table:
-    schema = make_schema({"id": pa.string(), "values": pa.list_(float_type)})
     ids = [None if (null_id and i == 0) else ("" if (bad_id and i == 0) else f"vec-{i}") for i in range(n)]
     vectors = []
     for i in range(n):
@@ -297,7 +292,6 @@ def test_non_finite_value(self):
         assert any("finite" in e.lower() or "Inf" in e for e in errors)

     def test_nan_value(self):
-        schema = make_schema({"id": pa.string(), "values": pa.list_(pa.float32())})
         table = pa.table(
             {
                 "id": pa.array(["a", "b"], pa.string()),
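The unexpected-column message fixed in patch 2 and the table helpers in these tests imply the accepted top-level layout: `id`, `values`, `sparse_values`, and `metadata`, with sparse indices nested inside the `sparse_values` struct and metadata carried as a JSON string checked against the 40 KB limit. A sketch of a file that should satisfy that schema check follows; the struct sub-field names (`indices`, `values`) and the exact metadata content are assumptions for illustration, not taken from the diffs:

```python
import json

import pyarrow as pa
import pyarrow.parquet as pq

# Only these four top-level columns are accepted; a separate top-level
# sparse_indices column is exactly what the corrected error message rejects.
schema = pa.schema(
    [
        pa.field("id", pa.string()),
        pa.field("values", pa.list_(pa.float32())),
        pa.field(
            "sparse_values",
            pa.struct(
                [("indices", pa.list_(pa.int64())), ("values", pa.list_(pa.float32()))]
            ),
        ),
        pa.field("metadata", pa.string()),  # JSON string, kept well under 40 KB
    ]
)

table = pa.table(
    {
        "id": ["vec-0", "vec-1"],
        "values": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
        "sparse_values": [
            {"indices": [1, 5], "values": [0.9, 0.1]},
            {"indices": [2], "values": [0.7]},
        ],
        "metadata": [json.dumps({"genre": "docs"}), json.dumps({"genre": "web"})],
    },
    schema=schema,
)
pq.write_table(table, "example.parquet")  # illustrative output path
```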