diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 24705f5..78e880d 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -192,7 +192,7 @@ jobs: MAX_ATTEMPTS=30 SLEEP_SECONDS=10 - echo "Attempting to install es2==${WHEEL_VERSION} from TestPyPI..." + echo "Attempting to install pyenvector==${WHEEL_VERSION} from TestPyPI..." ATTEMPTS=0 while true; do ATTEMPTS=$((ATTEMPTS + 1)) diff --git a/.gitignore b/.gitignore index eca98cc..cfda40f 100644 --- a/.gitignore +++ b/.gitignore @@ -39,10 +39,8 @@ keys/ VECTORSTORE.md # External symlinks (local workspace references) -es2-msa es2-msa/ -es2-deploy -es2-deploy/ +envector-deployment/ # Local helper scripts run_unit_tests.py diff --git a/CONTRIBUTE.md b/CONTRIBUTE.md index a2e8f13..6007c29 100644 --- a/CONTRIBUTE.md +++ b/CONTRIBUTE.md @@ -13,16 +13,16 @@ Thanks for your interest in improving the project! This guide covers local setup ## Testing - **Unit tests** (fakes only): `python run_unit_tests.py` -- **Integration tests** (requires ES2 server + keys): - - Export `ES2_ADDRESS`, `ES2_KEY_PATH`, `ES2_KEY_ID` - - Optional: `ES2_USE_EMBEDDINGS=1`, `ES2_EMB_MODEL`, `ES2_USE_HF_DATASET=1` +- **Integration tests** (requires EnVector server + keys): + - Export `ENVECTOR_ADDRESS`, `ENVECTOR_KEY_PATH`, `ENVECTOR_KEY_ID` + - Optional: `ENVECTOR_USE_EMBEDDINGS=1`, `ENVECTOR_EMB_MODEL`, `ENVECTOR_USE_HF_DATASET=1` - Run `pytest -m integration -s` Please run relevant tests before submitting a PR and mention coverage in the description. ## Development Guidelines - Keep code, comments, and docs in English. -- Prefer the high-level `es2` SDK APIs; avoid direct gRPC/indexer calls unless required. +- Prefer the high-level `pyenvector` SDK APIs; avoid direct gRPC/indexer calls unless required. - Keep changes focused and documented; update README or notebooks when behavior changes. - Follow existing formatting and type-hint conventions. 
diff --git a/README.md b/README.md index ac6fea8..393e18d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # LangChain Envector Integration -Encrypted vector search for LangChain using Envector (ES2), powered by homomorphic encryption (CKKS). This repo ships a LangChain-compatible VectorStore and retriever utilities built on the high-level `es2` Python SDK. +Encrypted vector search for LangChain using Envector, powered by homomorphic encryption (CKKS). This repo ships a LangChain-compatible VectorStore and retriever utilities built on the high-level `pyenvector` Python SDK. ## Features - LangChain `VectorStore` interface with `similarity_search`, `from_texts`, etc. @@ -13,10 +13,10 @@ Encrypted vector search for LangChain using Envector (ES2), powered by homomorph - `python3.11 -m venv .venv && source .venv/bin/activate` - Install runtime dependencies: - `pip install -U pip setuptools wheel` - - `pip install es2 langchain sentence-transformers` + - `pip install pyenvector langchain sentence-transformers` ## Usage Overview -1. Configure Envector using `EnvectorConfig`, pointing to your ES2 endpoint and keys. +1. Configure Envector using `EnvectorConfig`, pointing to your EnVector endpoint and keys. 2. Initialize embeddings (or provide pre-computed vectors). 3. Instantiate `Envector(config=cfg, embeddings=emb)` and call `add_texts`, `add_documents`, or use `as_retriever`. 4. Run `similarity_search` or plug the retriever into your LangChain pipeline. @@ -25,13 +25,13 @@ Encrypted vector search for LangChain using Envector (ES2), powered by homomorph ## Configuration Key dataclasses live in `libs/envector/config.py`: -- `ConnectionConfig`: address or host/port for ES2. +- `ConnectionConfig`: address or host/port for EnVector. - `KeyConfig`: key path, key ID, optional preset/eval mode. - `IndexSettings`: index name, dimension (32–4096), query encryption mode, optional output fields and fetch parameters. 
- `EnvectorConfig`: wraps the above and enables auto-creation via `create_if_missing`. ## Data Model -- Each vector stores a single `metadata` string in ES2. +- Each vector stores a single `metadata` string in EnVector. - To align with LangChain’s `Document`, inserts wrap data as JSON: `{"text": ..., "metadata": ...}`. - Retrieval unwraps JSON, returning `Document(page_content=text, metadata={...})`. - Client-side filtering requires the JSON envelope to include an object under `metadata`. @@ -48,12 +48,12 @@ Key dataclasses live in `libs/envector/config.py`: cfg = EnvectorConfig( connection=ConnectionConfig( - address=ES2_ADDRESS, - access_token=ES2_ACCESS_TOKEN + address=ENVECTOR_ADDRESS, + access_token=ENVECTOR_ACCESS_TOKEN ), key=KeyConfig( - key_path=ES2_KEY_PATH, - key_id=ES2_KEY_ID, + key_path=ENVECTOR_KEY_PATH, + key_id=ENVECTOR_KEY_ID, preset="ip", eval_mode="rmp" ), @@ -100,18 +100,18 @@ Key dataclasses live in `libs/envector/config.py`: The methods `similarity_search` and `similarity_search_with_vector` (with `embeddings.embed_query()`) are also available to perform vector search. ## Troubleshooting -- Connection issues: verify ES2 address and registered keys. +- Connection issues: verify EnVector address and registered keys. - Embeddings mismatch: ensure embedding dimension equals `index.dim` when supplying vectors. - Unexpected raw strings: confirm inserts used the JSON envelope. - Key Issues: check key's metadata to sync with the registered key if facing any key issue. 
-## Testing Without ES2 -- Run unit tests offline (no ES2 or SDK required): +## Testing Without EnVector +- Run unit tests offline (no EnVector or SDK required): - `python -m pytest -q -m "not integration"` - or `python scripts/run_unit_tests.py` - Run integration tests (requires server and keys): - - Export `ES2_ADDRESS`, `ES2_KEY_PATH`, `ES2_KEY_ID` - - Optional: `ES2_USE_EMBEDDINGS=1`, `ES2_EMB_MODEL`, `ES2_USE_HF_DATASET=1` + - Export `ENVECTOR_ADDRESS`, `ENVECTOR_KEY_PATH`, `ENVECTOR_KEY_ID` + - Optional: `ENVECTOR_USE_EMBEDDINGS=1`, `ENVECTOR_EMB_MODEL`, `ENVECTOR_USE_HF_DATASET=1` - `python -m pytest -q -m integration -s` ## Contributing diff --git a/libs/envector/README.md b/libs/envector/README.md index 0ff389d..a9323f2 100644 --- a/libs/envector/README.md +++ b/libs/envector/README.md @@ -1,15 +1,15 @@ # Envector (LangChain VectorStore) -High-level VectorStore adaptor for Envector (ES2), using the `es2` SDK. Vectors are always encrypted on the server; the SDK performs required crypto client-side. +High-level VectorStore adaptor for Envector, using the `pyenvector` SDK. Vectors are always encrypted on the server; the SDK performs required crypto client-side. Key points -- Use high-level `es2.ES2` and `es2.Index`; avoid low-level `es2.api.Indexer`/gRPC. +- Use high-level `pyenvector.EnvectorClient` and `pyenvector.Index`; avoid low-level `pyenvector.api.Indexer`/gRPC. - Index encryption is fixed to `cipher`. Query can be `plain` or `cipher`. - Metadata is stored as a single JSON string per item: `{id, text, metadata}`. Files - `config.py`: Configuration dataclasses (connection, key, index). -- `client.py`: Initializes ES2 + index and returns an `Index` instance. +- `client.py`: Initializes EnVector + index and returns an `Index` instance. - `vectorstore.py`: `Envector` VectorStore implementation. - `retriever.py`: Optional wrapper retriever. - `examples/`: Minimal examples. 
diff --git a/libs/envector/examples/basic_usage.py b/libs/envector/examples/basic_usage.py index d70b61b..88eeb47 100644 --- a/libs/envector/examples/basic_usage.py +++ b/libs/envector/examples/basic_usage.py @@ -1,7 +1,7 @@ """Basic usage example for Envector VectorStore. Requirements: -- `es2` +- `pyenvector` - `langchain` (version providing VectorStore APIs) - An embeddings backend, e.g. sentence-transformers """ diff --git a/libs/envector/examples/ingest_synthetic_1k.py b/libs/envector/examples/ingest_synthetic_1k.py index 8b055c8..ec92a7d 100644 --- a/libs/envector/examples/ingest_synthetic_1k.py +++ b/libs/envector/examples/ingest_synthetic_1k.py @@ -1,7 +1,7 @@ """Ingest the synthetic 1K dataset into Envector. Requires: -- ES2 server and keys. +- EnVector server and keys. - Dataset at `data/synthetic_rag_1k.jsonl` (run scripts/make_synthetic_rag_dataset.py). Usage: diff --git a/libs/envector/langchain_envector/__init__.py b/libs/envector/langchain_envector/__init__.py index cf7a9b6..0567c2e 100644 --- a/libs/envector/langchain_envector/__init__.py +++ b/libs/envector/langchain_envector/__init__.py @@ -1,6 +1,6 @@ """Envector LangChain integration package. -Provides a LangChain-compatible VectorStore that wraps the high-level `es2` SDK. +Provides a LangChain-compatible VectorStore that wraps the high-level `pyenvector` SDK. All code and comments are in English as per project rules. """ diff --git a/libs/envector/langchain_envector/client.py b/libs/envector/langchain_envector/client.py index c3ac2f9..930913d 100644 --- a/libs/envector/langchain_envector/client.py +++ b/libs/envector/langchain_envector/client.py @@ -4,45 +4,45 @@ class EnvectorClient: - """Thin convenience client around the high-level `es2` SDK. + """Thin convenience client around the high-level `pyenvector` SDK. 
- Establishes a connection - Initializes key and index configuration - Optionally creates the index if missing - - Provides access to the ES2 `Index` instance + - Provides access to the pyenvector `Index` instance """ def __init__(self, config: EnvectorConfig): self.config = config - self._es2 = None + self._ev = None self._index = None def init(self): - import es2 + import pyenvector as ev c = self.config.connection k = self.config.key i = self.config.index - es2_client = es2.ES2() + ev_client = ev.EnvectorClient() # Connection if c.address: - es2_client.init_connect(address=c.address, access_token=c.access_token) + ev_client.init_connect(address=c.address, access_token=c.access_token) else: if not (c.host and c.port): raise ValueError("Either address or host+port must be provided.") - es2_client.init_connect( + ev_client.init_connect( host=c.host, port=c.port, access_token=c.access_token ) # Key path baseline for Index - from es2.index import Index as _Index + from pyenvector.index import Index as _Index _Index.init_key_path(k.key_path) # Index config + key setup - es2_client.init_index_config( + ev_client.init_index_config( index_name=i.index_name, dim=i.dim, key_path=k.key_path, @@ -59,13 +59,13 @@ def init(self): # Create index if missing if self.config.create_if_missing: - idx_list = es2_client.get_index_list() + idx_list = ev_client.get_index_list() if i.index_name not in idx_list: - es2_client.create_index(index_name=i.index_name, dim=i.dim) + ev_client.create_index(index_name=i.index_name, dim=i.dim) # Bind index instance - self._index = es2.Index(i.index_name) - self._es2 = es2_client + self._index = ev.Index(i.index_name) + self._ev = ev_client return self @property @@ -75,7 +75,7 @@ def index(self): return self._index @property - def es2(self): - if self._es2 is None: + def ev(self): + if self._ev is None: raise RuntimeError("Client not initialized. 
Call init().") - return self._es2 + return self._ev diff --git a/libs/envector/langchain_envector/types.py b/libs/envector/langchain_envector/types.py index 286a24c..99d4a73 100644 --- a/libs/envector/langchain_envector/types.py +++ b/libs/envector/langchain_envector/types.py @@ -29,9 +29,9 @@ class SearchResult: def pack_metadata(text: str, metadata: Optional[Dict[str, Any]] = None) -> str: - """Pack text and metadata into a single JSON string field accepted by ES2. + """Pack text and metadata into a single JSON string field accepted by pyenvector. - ES2 metadata API stores lists of strings; we store a single JSON blob per item. + pyenvector metadata API stores lists of strings; we store a single JSON blob per item. Item-level IDs are not persisted/addressable. """ import json @@ -46,7 +46,7 @@ def pack_metadata(text: str, metadata: Optional[Dict[str, Any]] = None) -> str: def unpack_metadata(raw: Any) -> Dict[str, Any]: """Return metadata as a dict regardless of the raw payload type. - Recent ES2 versions may return decrypted metadata as a Python dict instead + Recent pyenvector versions may return decrypted metadata as a Python dict instead of the JSON string we originally stored. We normalise the payload here so downstream code always works with a dictionary. """ @@ -79,7 +79,7 @@ def unpack_metadata(raw: Any) -> Dict[str, Any]: if isinstance(data, dict): return data except Exception: - # Some ES2 responses return Python-literal strings (single quotes). + # Some pyenvector responses return Python-literal strings (single quotes). try: import ast diff --git a/libs/envector/langchain_envector/vectorstore.py b/libs/envector/langchain_envector/vectorstore.py index eef1cb5..67bccbf 100644 --- a/libs/envector/langchain_envector/vectorstore.py +++ b/libs/envector/langchain_envector/vectorstore.py @@ -38,10 +38,10 @@ def __init__( class Envector(VectorStore): # type: ignore[misc] - """LangChain-compatible VectorStore adaptor for Envector (ES2). 
+ """LangChain-compatible VectorStore adaptor for Envector. - This class wraps the high-level `es2` SDK. It does not use low-level - gRPC stubs or `es2.api.Indexer` directly. + This class wraps the high-level `pyenvector` SDK. It does not use low-level + gRPC stubs or `pyenvector.api.Indexer` directly. """ def __init__( @@ -89,7 +89,7 @@ def add_texts( # Prepare metadata JSON strings per item packed = [pack_metadata(t, m) for t, m in zip(texts, metadatas)] - # Insert using high-level ES2 Index + # Insert using high-level pyenvector Index result_ids = self.client.index.insert(data=vectors, metadata=packed) # Return ephemeral placeholders to satisfy VectorStore interface, @@ -111,7 +111,7 @@ def _similarity_search_with_scores( results = self.client.index.search( query=embedding, top_k=top_k, output_fields=self.config.index.output_fields ) - # ES2 Index.search returns a list for each query; we passed single query + # pyenvector Index.search returns a list for each query; we passed single query result = ( results[0] if isinstance(results, list) and results and isinstance(results[0], list) @@ -265,7 +265,7 @@ def add_documents( extracting `page_content` and `metadata` from each Document. Notes: - - Manual `ids` are ignored (ES2 does not support user-provided IDs). + - Manual `ids` are ignored (EnVector does not support user-provided IDs). - When `embeddings` is not configured, you must supply `vectors`. - Returns ephemeral IDs as produced by the client insert. 
""" diff --git a/pyproject.toml b/pyproject.toml index 2e21520..acb83b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,8 +7,8 @@ build-backend = "setuptools.build_meta" [project] name = "langchain-envector" -version = "0.1.2" -description = "LangChain VectorStore integration for Envector (ES2) encrypted vector search" +version = "0.1.3" +description = "LangChain VectorStore integration for Envector" readme = "README.md" license = {text = "MIT"} requires-python = ">=3.9,<3.14" @@ -16,10 +16,10 @@ authors = [ { name = "Envector Contributors" } ] dependencies = [ - "es2", + "pyenvector", "langchain>=0.2.0", ] -keywords = ["langchain", "vectorstore", "homomorphic-encryption", "ckks", "encrypted-search", "envector", "es2"] +keywords = ["langchain", "vectorstore", "homomorphic-encryption", "ckks", "encrypted-search", "envector", "pyenvector"] classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", diff --git a/pytest.ini b/pytest.ini index 6d12c43..70b79b9 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,6 +1,6 @@ [pytest] markers = - integration: tests that require a running ES2 server and the real es2 SDK + integration: tests that require a running EnVector server and the real EnVector SDK testpaths = tests diff --git a/tests/integration/test_es2_integration.py b/tests/integration/test_es2_integration.py index 8f5fca1..a7272fd 100644 --- a/tests/integration/test_es2_integration.py +++ b/tests/integration/test_es2_integration.py @@ -25,35 +25,30 @@ def _require_env(name: str) -> str: @pytest.mark.skipif( - os.environ.get("ES2_ADDRESS") is None, - reason="Set ES2_ADDRESS (e.g., 0.0.0.0:50050) to enable ES2 integration tests", + os.environ.get("ENVECTOR_ADDRESS") is None, + reason="Set ENVECTOR_ADDRESS (e.g., 0.0.0.0:50050) to enable Envector integration tests", ) def test_e2e_vectorstore_plain_and_cipher(): - try: - import es2 # type: ignore - except Exception as e: # pragma: no cover - env-dependent - pytest.skip(f"es2 SDK 
not available: {e}") - - address = _require_env("ES2_ADDRESS") - key_path = _require_env("ES2_KEY_PATH") - key_id = _require_env("ES2_KEY_ID") - use_emb = os.environ.get("ES2_USE_EMBEDDINGS") in {"1", "true", "TRUE", "yes"} + address = _require_env("ENVECTOR_ADDRESS") + key_path = _require_env("ENVECTOR_KEY_PATH") + key_id = _require_env("ENVECTOR_KEY_ID") + use_emb = os.environ.get("ENVECTOR_USE_EMBEDDINGS") in {"1", "true", "TRUE", "yes"} model_name = os.environ.get( - "ES2_EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2" + "ENVECTOR_EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2" ) - use_hf = os.environ.get("ES2_USE_HF_DATASET") in {"1", "true", "TRUE", "yes"} - hf_name = os.environ.get("ES2_HF_NAME", "ag_news") - hf_subset = os.environ.get("ES2_HF_SUBSET") - hf_split = os.environ.get("ES2_HF_SPLIT", "train") - hf_text_col = os.environ.get("ES2_HF_TEXT_COL", "text") + use_hf = os.environ.get("ENVECTOR_USE_HF_DATASET") in {"1", "true", "TRUE", "yes"} + hf_name = os.environ.get("ENVECTOR_HF_NAME", "ag_news") + hf_subset = os.environ.get("ENVECTOR_HF_SUBSET") + hf_split = os.environ.get("ENVECTOR_HF_SPLIT", "train") + hf_text_col = os.environ.get("ENVECTOR_HF_TEXT_COL", "text") hf_meta_cols = [ - c for c in os.environ.get("ES2_HF_META_COLS", "label").split(",") if c + c for c in os.environ.get("ENVECTOR_HF_META_COLS", "label").split(",") if c ] - hf_size = int(os.environ.get("ES2_HF_SIZE", "200")) - hf_seed = int(os.environ.get("ES2_HF_SEED", "42")) + hf_size = int(os.environ.get("ENVECTOR_HF_SIZE", "200")) + hf_seed = int(os.environ.get("ENVECTOR_HF_SEED", "42")) # Determine dimension: either from env, or from embeddings model, or default - dim_env = os.environ.get("ES2_DIM") + dim_env = os.environ.get("ENVECTOR_DIM") if use_emb: emb = None # Prefer LangChain embeddings if available, else fall back to sentence-transformers @@ -79,13 +74,13 @@ def test_e2e_vectorstore_plain_and_cipher(): pytest.skip("Envector supports dimensions in [32, 4096]") 
base_index_name = os.environ.get( - "ES2_INDEX_NAME", f"inttest_{secrets.token_hex(4)}" + "ENVECTOR_INDEX_NAME", f"inttest_{secrets.token_hex(4)}" ) - import es2 + import pyenvector as ev - es2.init_connect(address=address) - es2.reset() + ev.init_connect(address=address) + ev.reset() # Plain query mode cfg_plain = EnvectorConfig( @@ -205,8 +200,8 @@ def test_e2e_vectorstore_plain_and_cipher(): assert all("_id" in d.metadata for d in docs_cc) # Cleanup - store_plain.client.es2.init_connect(address=address) - store_plain.client.es2.drop_index(cfg_plain.index.index_name) + store_plain.client.ev.init_connect(address=address) + store_plain.client.ev.drop_index(cfg_plain.index.index_name) - store_cc.client.es2.init_connect(address=address) - store_cc.client.es2.drop_index(cfg_cc.index.index_name) + store_cc.client.ev.init_connect(address=address) + store_cc.client.ev.drop_index(cfg_cc.index.index_name) diff --git a/tests/requirements.txt b/tests/requirements.txt index d51cc17..02b0ade 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -3,9 +3,9 @@ # Test runner pytest -# ES2 SDK (encrypted vector search) — install from local wheel at repo root +# pyenvector SDK — install from local wheel at repo root # Use a direct wheel path (no PEP 508 direct reference) for maximum pip compatibility. -./es2-1.0.3rc7-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +./pyenvector-1.2.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl # Note: LangChain is optional for tests. Integration tests will fall back to # sentence-transformers if LangChain embeddings are unavailable.