69 changes: 69 additions & 0 deletions CLAUDE.md
@@ -0,0 +1,69 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Development Setup

```sh
uv sync --extra grpc --extra asyncio # install all dependencies
uv run pre-commit install # enable lint/format checks on commit
```

## Key Commands

```sh
make test-unit # run unit + grpc unit tests
uv run pytest tests/unit # REST unit tests only
uv run pytest tests/unit_grpc # gRPC unit tests only
uv run pytest tests/unit/path/to/test_file.py::ClassName::test_method # single test

uv run mypy pinecone # type-check (excludes pinecone/core/)
uv run ruff check --fix # lint
uv run ruff format # format

uv run repl # interactive REPL with pre-loaded Pinecone client

make generate-oas # regenerate pinecone/core/openapi/ from OpenAPI specs
```

Integration tests make live Pinecone API calls and incur cost — only Pinecone employees should run them. Set credentials in `.env` (see `.env.example`) before running.
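
A minimal `.env` sketch (the variable name is an assumption; `.env.example` is the authoritative list):

```sh
# Assumed key name; copy .env.example rather than trusting this sketch.
PINECONE_API_KEY="<your-api-key>"
```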

## Architecture

### Layer Overview

```
Pinecone / PineconeAsyncio ← public entry point (pinecone/pinecone.py, pinecone_asyncio.py)
├── DBControl ← index/collection/backup management (pinecone/db_control/)
├── DBData / Index ← vector upsert/query/fetch/delete (pinecone/db_data/)
└── Inference ← embedding and reranking models (pinecone/inference/)
```

`Pinecone` and `PineconeAsyncio` are thin facades. Each delegates to `DBControl` (control-plane operations) and returns `Index` / `IndexAsyncio` objects (data-plane operations). Inference is accessible via `pc.inference`.
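
A minimal usage sketch of that delegation (the host value is a placeholder; the embed parameters are illustrative):

```python
from pinecone import Pinecone

pc = Pinecone(api_key="YOUR_API_KEY")  # thin facade
pc.list_indexes()  # delegates to DBControl (control plane)
index = pc.Index(host="YOUR_INDEX_HOST")  # returns a db_data Index (data plane)
embeddings = pc.inference.embed(  # Inference layer via pc.inference
    model="multilingual-e5-large",
    inputs=["hello world"],
    parameters={"input_type": "passage"},
)
```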

### Generated Code — Never Edit Manually

`pinecone/core/openapi/` is fully generated from OpenAPI specs via `make generate-oas` (which runs `codegen/build-oas.sh`). The script calls the openapi-generator Docker image, applies several post-processing fixes (underscore field name normalization, datetime coercion removal, shared-class deduplication), then runs `ruff format`. **Do not hand-edit files in `pinecone/core/`.**

Shared OpenAPI utilities (ApiClient, exceptions, model_utils, etc.) live in `pinecone/openapi_support/` rather than being duplicated across the five generated modules (`db_control`, `db_data`, `inference`, `oauth`, `admin`).

### Adapter Layer

`pinecone/adapters/` converts generated OpenAPI response objects into clean SDK dataclasses. This isolates the rest of the SDK from generated-model churn. When a new response type is needed, add it here rather than parsing OpenAPI objects in `index.py` or other business-logic files.
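
A hypothetical adapter in that style (the class and function names are illustrative, not actual files in `pinecone/adapters/`):

```python
from dataclasses import dataclass

@dataclass
class IndexSummary:
    name: str
    dimension: int

def index_summary_from_openapi(model) -> IndexSummary:
    # Touch generated attributes only here, so a rename in the generated
    # model means editing one adapter instead of scattered business logic.
    return IndexSummary(name=model.name, dimension=model.dimension)
```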

### Sync / Async Split

Every stateful class has a sync and an async variant:
- `DBControl` / `DBControlAsyncio`
- `Index` (in `db_data/index.py`) / `IndexAsyncio` (in `db_data/index_asyncio.py`)
- `Inference` / `AsyncioInference`

The async variants use `aiohttp` (optional extra). The sync variants use `urllib3`. gRPC is a third transport option installed via the `grpc` extra; data-plane integration tests can be toggled to gRPC with `USE_GRPC=true`.
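
A side-by-side sketch of the two transports (host values are placeholders; the async classes are used as context managers so `aiohttp` sessions get closed):

```python
import asyncio
from pinecone import Pinecone, PineconeAsyncio

# Sync: urllib3 under the hood
index = Pinecone(api_key="YOUR_API_KEY").Index(host="YOUR_INDEX_HOST")
index.fetch(ids=["vec-1"])

# Async: aiohttp under the hood (requires the asyncio extra)
async def main():
    async with PineconeAsyncio(api_key="YOUR_API_KEY") as pc:
        async with pc.IndexAsyncio(host="YOUR_INDEX_HOST") as aindex:
            await aindex.fetch(ids=["vec-1"])

asyncio.run(main())
```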

### Lazy Imports

`pinecone/__init__.py` defers most imports through `utils/lazy_imports.py` to keep module startup time fast. When adding new public symbols, register them in the lazy import maps in `__init__.py` rather than adding top-level imports. The `.pyi` stub (`__init__.pyi`) is the authoritative type-visible public API surface and must be kept in sync.
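
A minimal sketch of the deferred-import pattern, assuming the standard module-level `__getattr__` hook (PEP 562); the real wiring lives in `utils/lazy_imports.py`:

```python
import importlib

# Maps public name -> (module path, attribute), mirroring entries such as
# "FilterBuilder": ("pinecone.db_data.filter_builder", "FilterBuilder").
_LAZY_IMPORTS = {
    "FilterBuilder": ("pinecone.db_data.filter_builder", "FilterBuilder"),
}

def __getattr__(name: str):
    try:
        module_path, attr = _LAZY_IMPORTS[name]
    except KeyError:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from None
    return getattr(importlib.import_module(module_path), attr)
```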

### Testing Philosophy

Unit tests are intentionally sparse — they cover data conversion edge cases (e.g. `VectorFactory`, `QueryResultsAggregator`) but not every method. Most confidence comes from integration tests. When writing unit tests, check `tests/unit/db_data/` for patterns. Fixtures and index setup/teardown for integration tests live in `conftest.py` files at each directory level.
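
For instance, a minimal test in that spirit for the `BulkImportValidationResult` repr added in this PR (the file location under `tests/unit/db_data/` is an assumption):

```python
from pinecone import BulkImportValidationResult

def test_repr_lists_errors_for_invalid_result():
    result = BulkImportValidationResult(
        is_valid=False,
        uri="s3://bucket/vectors/",
        errors=["missing required 'id' column"],
    )
    text = repr(result)
    assert "INVALID" in text
    assert "missing required 'id' column" in text
```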
8 changes: 8 additions & 0 deletions pinecone/__init__.py
@@ -60,6 +60,14 @@
"UpdateRequest": ("pinecone.db_data.models", "UpdateRequest"),
"NamespaceDescription": ("pinecone.core.openapi.db_data.models", "NamespaceDescription"),
"ImportErrorMode": ("pinecone.db_data.resources.sync.bulk_import", "ImportErrorMode"),
"BulkImportValidationResult": (
"pinecone.db_data.dataclasses.bulk_import_validation_result",
"BulkImportValidationResult",
),
"validate_bulk_import": (
"pinecone.db_data.resources.sync.bulk_import_validator",
"validate_bulk_import_uri",
),
"FilterBuilder": ("pinecone.db_data.filter_builder", "FilterBuilder"),
"VectorDictionaryMissingKeysError": (
"pinecone.db_data.errors",
2 changes: 2 additions & 0 deletions pinecone/__init__.pyi
@@ -50,6 +50,8 @@ from pinecone.db_data.models import (
)
from pinecone.core.openapi.db_data.models import NamespaceDescription
from pinecone.db_data.resources.sync.bulk_import import ImportErrorMode
from pinecone.db_data.dataclasses.bulk_import_validation_result import BulkImportValidationResult
from pinecone.db_data.resources.sync.bulk_import_validator import validate_bulk_import_uri as validate_bulk_import
from pinecone.db_data.errors import (
VectorDictionaryMissingKeysError,
VectorDictionaryExcessKeysError,
2 changes: 2 additions & 0 deletions pinecone/db_data/dataclasses/__init__.py
@@ -8,6 +8,7 @@
from .query_response import QueryResponse
from .upsert_response import UpsertResponse
from .update_response import UpdateResponse
from .bulk_import_validation_result import BulkImportValidationResult

__all__ = [
"SparseValues",
@@ -21,4 +22,5 @@
"QueryResponse",
"UpsertResponse",
"UpdateResponse",
"BulkImportValidationResult",
]
42 changes: 42 additions & 0 deletions pinecone/db_data/dataclasses/bulk_import_validation_result.py
@@ -0,0 +1,42 @@
from __future__ import annotations

from dataclasses import dataclass, field


@dataclass
class BulkImportValidationResult:
"""Result of a bulk import parquet validation check.

Attributes:
is_valid: True if no errors were found.
uri: The URI that was validated. Pass directly to ``index.bulk_import.start()``.
errors: Blocking issues that would cause the import to fail.
warnings: Non-blocking observations (e.g. detected dimension).
files_checked: Number of parquet files whose schema was inspected.
rows_sampled: Number of data rows checked (0 if schema-only validation).
"""

is_valid: bool
uri: str = ""
errors: list[str] = field(default_factory=list)
warnings: list[str] = field(default_factory=list)
files_checked: int = 0
rows_sampled: int = 0

def __repr__(self) -> str:
status = "VALID" if self.is_valid else "INVALID"
lines = [f"BulkImportValidationResult({status})"]
if self.uri:
lines.append(f" uri={self.uri!r}")
if self.errors:
lines.append(f" errors ({len(self.errors)}):")
for e in self.errors:
lines.append(f" - {e}")
if self.warnings:
lines.append(f" warnings ({len(self.warnings)}):")
for w in self.warnings:
lines.append(f" - {w}")
lines.append(
f" files_checked={self.files_checked}, rows_sampled={self.rows_sampled}"
)
return "\n".join(lines)
24 changes: 24 additions & 0 deletions pinecone/db_data/resources/asyncio/bulk_import_asyncio.py
@@ -11,6 +11,10 @@
)

from ..sync.bulk_import_request_factory import BulkImportRequestFactory
from ..sync.bulk_import_validator import validate_bulk_import_uri
from pinecone.db_data.dataclasses.bulk_import_validation_result import (
BulkImportValidationResult,
)

for m in [StartImportResponse, ListImportsResponse, ImportModel]:
install_json_repr_override(m)
@@ -150,3 +154,23 @@ async def cancel(self, id: str):
"""
args = BulkImportRequestFactory.cancel_import_args(id=id)
return await self.__import_operations_api.cancel_bulk_import(**args)

def validate(
self,
uri: str,
dimension: int | None = None,
vector_type: Literal["dense", "sparse"] | None = None,
sample_rows: int = 100,
verbose: bool = False,
) -> "BulkImportValidationResult":
"""Validate parquet file(s) for Pinecone bulk import compatibility.

This method is synchronous; pyarrow does not support async file I/O.
For schema-only validation (no data download), pass ``sample_rows=0``.

See :meth:`pinecone.db_data.resources.sync.bulk_import.BulkImportResource.validate`
for full documentation.
"""
return validate_bulk_import_uri(
uri, dimension=dimension, vector_type=vector_type, sample_rows=sample_rows, verbose=verbose
)
55 changes: 55 additions & 0 deletions pinecone/db_data/resources/sync/bulk_import.py
@@ -11,6 +11,10 @@
)

from .bulk_import_request_factory import BulkImportRequestFactory, ImportErrorMode
from .bulk_import_validator import validate_bulk_import_uri
from pinecone.db_data.dataclasses.bulk_import_validation_result import (
BulkImportValidationResult,
)

for m in [StartImportResponse, ListImportsResponse, ImportModel]:
install_json_repr_override(m)
@@ -157,3 +161,54 @@ def cancel(self, id: str):
"""
args = BulkImportRequestFactory.cancel_import_args(id=id)
return self.__import_operations_api.cancel_bulk_import(**args)

def validate(
self,
uri: str,
dimension: int | None = None,
vector_type: Literal["dense", "sparse"] | None = None,
sample_rows: int = 100,
verbose: bool = False,
) -> "BulkImportValidationResult":
"""Validate parquet file(s) for Pinecone bulk import compatibility.

Reads the parquet file footer (schema metadata), which is fast even
for large remote files. With ``sample_rows > 0`` (the default is 100),
it also reads a small number of rows and checks for null IDs,
non-finite vector values, and metadata correctness.

Requires ``pyarrow``. Install with ``pip install 'pinecone[parquet]'``.
Remote URIs (``s3://``, ``gs://``, ``az://``) work automatically when
the appropriate filesystem library is available in your environment
(``pyarrow`` includes built-in S3 support).

Args:
uri: Local path or remote URI. May point to a single ``.parquet``
file or a directory/prefix containing multiple files.
dimension: Expected vector dimension. A mismatch is reported as an
error. When omitted, dimension is inferred from the schema if
the file uses a ``fixed_size_list`` type.
vector_type: ``"dense"`` or ``"sparse"``. Inferred from column
names when omitted.
sample_rows: Rows to read for data-level checks. Set to ``0`` for
schema-only validation (no data download).

Returns:
:class:`~pinecone.BulkImportValidationResult`

Examples:
>>> result = index.bulk_import.validate("s3://my-bucket/vectors/")
>>> if not result.is_valid:
... for error in result.errors:
... print(error)

>>> # Schema-only check — reads only the parquet footer
>>> result = index.bulk_import.validate(
... "s3://my-bucket/vectors/",
... dimension=1024,
... sample_rows=0,
... )
"""
return validate_bulk_import_uri(
uri, dimension=dimension, vector_type=vector_type, sample_rows=sample_rows, verbose=verbose
)