Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pinecone/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,14 @@
"UpdateRequest": ("pinecone.db_data.models", "UpdateRequest"),
"NamespaceDescription": ("pinecone.core.openapi.db_data.models", "NamespaceDescription"),
"ImportErrorMode": ("pinecone.db_data.resources.sync.bulk_import", "ImportErrorMode"),
"BulkImportValidationResult": (
"pinecone.db_data.dataclasses.bulk_import_validation_result",
"BulkImportValidationResult",
),
"validate_bulk_import": (
"pinecone.db_data.resources.sync.bulk_import_validator",
"validate_bulk_import_uri",
),
"FilterBuilder": ("pinecone.db_data.filter_builder", "FilterBuilder"),
"VectorDictionaryMissingKeysError": (
"pinecone.db_data.errors",
Expand Down
4 changes: 4 additions & 0 deletions pinecone/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ from pinecone.db_data.models import (
)
from pinecone.core.openapi.db_data.models import NamespaceDescription
from pinecone.db_data.resources.sync.bulk_import import ImportErrorMode
from pinecone.db_data.dataclasses.bulk_import_validation_result import BulkImportValidationResult as BulkImportValidationResult
from pinecone.db_data.resources.sync.bulk_import_validator import validate_bulk_import_uri as validate_bulk_import
Comment thread
cursor[bot] marked this conversation as resolved.
from pinecone.db_data.errors import (
VectorDictionaryMissingKeysError,
VectorDictionaryExcessKeysError,
Expand Down Expand Up @@ -182,6 +184,8 @@ __all__ = [
"UpdateRequest",
"NamespaceDescription",
"ImportErrorMode",
"BulkImportValidationResult",
"validate_bulk_import",
"FilterBuilder",
# Error classes
"VectorDictionaryMissingKeysError",
Expand Down
2 changes: 2 additions & 0 deletions pinecone/db_data/dataclasses/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .query_response import QueryResponse
from .upsert_response import UpsertResponse
from .update_response import UpdateResponse
from .bulk_import_validation_result import BulkImportValidationResult

__all__ = [
"SparseValues",
Expand All @@ -21,4 +22,5 @@
"QueryResponse",
"UpsertResponse",
"UpdateResponse",
"BulkImportValidationResult",
]
42 changes: 42 additions & 0 deletions pinecone/db_data/dataclasses/bulk_import_validation_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from __future__ import annotations

from dataclasses import dataclass, field


@dataclass
class BulkImportValidationResult:
    """Outcome of a bulk-import parquet validation check.

    Attributes:
        is_valid: True if no errors were found.
        uri: The URI that was validated. Pass directly to ``index.bulk_import.start()``.
        errors: Blocking issues that would cause the import to fail.
        warnings: Non-blocking observations (e.g. detected dimension).
        files_checked: Number of parquet files whose schema was inspected.
        rows_sampled: Number of data rows checked (0 if schema-only validation).
    """

    is_valid: bool
    uri: str = ""
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)
    files_checked: int = 0
    rows_sampled: int = 0

    def __repr__(self) -> str:
        # Human-oriented multi-line summary (the SDK favors readable reprs
        # over eval-able ones for result objects).
        verdict = "VALID" if self.is_valid else "INVALID"
        out = [f"BulkImportValidationResult({verdict})"]
        if self.uri:
            out.append(f" uri={self.uri!r}")
        # errors and warnings share the same rendering shape, so render
        # them through one loop instead of two copy-pasted branches.
        for label, items in (("errors", self.errors), ("warnings", self.warnings)):
            if items:
                out.append(f" {label} ({len(items)}):")
                out.extend(f" - {item}" for item in items)
        out.append(f" files_checked={self.files_checked}, rows_sampled={self.rows_sampled}")
        return "\n".join(out)
24 changes: 24 additions & 0 deletions pinecone/db_data/resources/asyncio/bulk_import_asyncio.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
)

from ..sync.bulk_import_request_factory import BulkImportRequestFactory
from ..sync.bulk_import_validator import validate_bulk_import_uri
from pinecone.db_data.dataclasses.bulk_import_validation_result import (
BulkImportValidationResult,
)

for m in [StartImportResponse, ListImportsResponse, ImportModel]:
install_json_repr_override(m)
Expand Down Expand Up @@ -150,3 +154,23 @@ async def cancel(self, id: str):
"""
args = BulkImportRequestFactory.cancel_import_args(id=id)
return await self.__import_operations_api.cancel_bulk_import(**args)

def validate(
    self,
    uri: str,
    dimension: int | None = None,
    vector_type: Literal["dense", "sparse"] | None = None,
    sample_rows: int = 100,
    verbose: bool = False,
) -> BulkImportValidationResult:
    """Validate parquet file(s) for Pinecone bulk import compatibility.

    Note: this runs synchronously even on the asyncio client, because
    pyarrow performs blocking file I/O. For a schema-only check that
    downloads no data, pass ``sample_rows=0``.

    See :meth:`pinecone.db_data.resources.sync.bulk_import.BulkImportResource.validate`
    for full documentation.
    """
    # Delegate to the shared synchronous validator.
    options = {
        "dimension": dimension,
        "vector_type": vector_type,
        "sample_rows": sample_rows,
        "verbose": verbose,
    }
    return validate_bulk_import_uri(uri, **options)
55 changes: 55 additions & 0 deletions pinecone/db_data/resources/sync/bulk_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
)

from .bulk_import_request_factory import BulkImportRequestFactory, ImportErrorMode
from .bulk_import_validator import validate_bulk_import_uri
from pinecone.db_data.dataclasses.bulk_import_validation_result import (
BulkImportValidationResult,
)

for m in [StartImportResponse, ListImportsResponse, ImportModel]:
install_json_repr_override(m)
Expand Down Expand Up @@ -157,3 +161,54 @@ def cancel(self, id: str):
"""
args = BulkImportRequestFactory.cancel_import_args(id=id)
return self.__import_operations_api.cancel_bulk_import(**args)

def validate(
    self,
    uri: str,
    dimension: int | None = None,
    vector_type: Literal["dense", "sparse"] | None = None,
    sample_rows: int = 100,
    verbose: bool = False,
) -> BulkImportValidationResult:
    """Validate parquet file(s) for Pinecone bulk import compatibility.

    Reads only the parquet file footer (schema metadata) by default, making
    this fast even for large remote files. Pass ``sample_rows > 0`` (the
    default) to also read a small number of rows and check for null IDs,
    non-finite vector values, and metadata correctness.

    Requires ``pyarrow``. Install with ``pip install 'pinecone[parquet]'``.
    Remote URIs (``s3://``, ``gs://``, ``az://``) work automatically when
    the appropriate filesystem library is available in your environment
    (``pyarrow`` includes built-in S3 support).

    Args:
        uri: Local path or remote URI. May point to a single ``.parquet``
            file or a directory/prefix containing multiple files.
        dimension: Expected vector dimension. A mismatch is reported as an
            error. When omitted, dimension is inferred from the schema if
            the file uses a ``fixed_size_list`` type.
        vector_type: ``"dense"`` or ``"sparse"``. Inferred from column
            names when omitted.
        sample_rows: Rows to read for data-level checks. Set to ``0`` for
            schema-only validation (no data download).
        verbose: Forwarded to the underlying validator; presumably enables
            progress/diagnostic output while validating. TODO(review):
            confirm exact behavior against ``validate_bulk_import_uri``.

    Returns:
        :class:`~pinecone.BulkImportValidationResult`

    Examples:
        >>> result = index.bulk_import.validate("s3://my-bucket/vectors/")
        >>> if not result.is_valid:
        ...     for error in result.errors:
        ...         print(error)

        >>> # Schema-only check — reads only the parquet footer
        >>> result = index.bulk_import.validate(
        ...     "s3://my-bucket/vectors/",
        ...     dimension=1024,
        ...     sample_rows=0,
        ... )
    """
    # All validation logic lives in the standalone validator so the sync
    # and asyncio clients share one implementation.
    return validate_bulk_import_uri(
        uri,
        dimension=dimension,
        vector_type=vector_type,
        sample_rows=sample_rows,
        verbose=verbose,
    )
Loading
Loading