From c4418ecc8d6b8f03e995ee28962b956e2a10499a Mon Sep 17 00:00:00 2001 From: inkme Date: Sun, 12 Oct 2025 22:13:34 +0900 Subject: [PATCH 01/13] ES2-975: Implement add_documents in VectorStore, add unit tests, offline test runner, and doc updates (README, VECTORSTORE.md) --- README.md | 22 ++++- VECTORSTORE.md | 74 +++++++++++++++ es2-msa | 1 + libs/envector/langchain_envector/client.py | 1 - .../langchain_envector/vectorstore.py | 22 +++++ run_unit_tests.py | 23 +++++ tests/test_vectorstore.py | 93 ++++++++++++++++++- 7 files changed, 233 insertions(+), 3 deletions(-) create mode 100644 VECTORSTORE.md create mode 120000 es2-msa create mode 100644 run_unit_tests.py diff --git a/README.md b/README.md index 5662696..7a231ae 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Encrypted vector search for LangChain using Envector (ES2), powered by homomorph ## Usage Overview 1. Configure Envector using `EnvectorConfig`, pointing to your ES2 endpoint and keys. 2. Initialize embeddings (or provide pre-computed vectors). -3. Instantiate `Envector(config=cfg, embeddings=emb)` and call `add_texts` or `as_retriever`. +3. Instantiate `Envector(config=cfg, embeddings=emb)` and call `add_texts`, `add_documents`, or use `as_retriever`. 4. Run `similarity_search` or plug the retriever into your LangChain pipeline. > See `notebooks/` for end-to-end walkthroughs and the `libs/envector` package for implementation details. @@ -41,10 +41,30 @@ Key dataclasses live in `libs/envector/config.py`: - Manual item IDs are not accepted; returned IDs from `add_texts` are ephemeral. - Filtering happens client-side; ensure metadata is JSON for structured filters. +## Examples +- Add documents (from LangChain Documents): + - Python + - from langchain_core.documents import Document + - docs = [ + Document(page_content="chunk-1", metadata={"source": "paper.pdf", "page": 1, "chunk": 0}), + Document(page_content="chunk-2", metadata={"source": "paper.pdf", "page": 1, "chunk": 1}), + ] + - store = Envector(config=cfg, embeddings=emb) + - store.add_documents(docs) + ## Troubleshooting - Connection issues: verify ES2 address and registered keys. - Embeddings mismatch: ensure embedding dimension equals `index.dim` when supplying vectors. - Unexpected raw strings: confirm inserts used the JSON envelope. +## Testing Without ES2 +- Run unit tests offline (no ES2 or SDK required): + - `python -m pytest -q -m "not integration"` + - or `python run_unit_tests.py` +- Run integration tests (requires server and keys): + - Export `ES2_ADDRESS`, `ES2_KEY_PATH`, `ES2_KEY_ID` + - Optional: `ES2_USE_EMBEDDINGS=1`, `ES2_EMB_MODEL`, `ES2_USE_HF_DATASET=1` + - `python -m pytest -q -m integration -s` + ## Contributing See [`CONTRIBUTE.md`](CONTRIBUTE.md) for development, testing, and PR guidelines. diff --git a/VECTORSTORE.md b/VECTORSTORE.md new file mode 100644 index 0000000..7bcfd3b --- /dev/null +++ b/VECTORSTORE.md @@ -0,0 +1,74 @@ +# LangChain VectorStore API 지원 현황 + +이 문서는 LangChain의 VectorStore 인터페이스와 현재 envector에서 지원하는 기능들을 비교 분석한 결과입니다. + +## API 지원 현황 테이블 + +| 메서드 | 설명 | 현재 상태 | 비고 | +|--------|------|-----------|------| +| **문서 추가/관리** | +| `add_documents(documents)` | Document 객체로 문서 추가 | 🔧 구현 가능 | `add_texts` 래핑으로 구현 가능 | +| `add_texts(texts, metadatas, ids)` | 텍스트로 직접 추가 | ✅ 구현됨 | 완전 지원 | +| `add_documents(documents)` | 문서 추가 | ✅ 지원 | `add_texts` 위임, 임베딩/벡터 경로 지원 | +| `upsert_documents(documents)` | 문서 추가/업데이트 | ❌ 구현 불가 | ES2 SDK 제한으로 불가능 | +| `upsert_texts(texts, metadatas, ids)` | 텍스트 추가/업데이트 | ❌ 구현 불가 | ES2 SDK 제한으로 불가능 | +| **문서 삭제** | +| `delete(ids)` | ID로 문서 삭제 | ❌ 구현 불가 | ES2 SDK 제한으로 불가능 | +| `delete_documents(documents)` | Document 객체로 삭제 | ❌ 구현 불가 | ES2 SDK 제한으로 불가능 | +| **검색** | +| `similarity_search(query, k, filter)` | 유사도 검색 | ✅ 구현됨 | 완전 지원 | +| `similarity_search_with_score(query, k, filter)` | 점수와 함께 유사도 검색 | 🔧 구현 가능 | `_score`를 메타데이터로 제공 중 | +| `similarity_search_by_vector(embedding, k, filter)` | 벡터로 직접 검색 | ✅ 구현됨 | 완전 지원 | +| `similarity_search_with_score_by_vector(embedding, k, filter)` | 벡터로 점수와 함께 검색 | 🔧 구현 가능 | `_score`를 메타데이터로 제공 중 | +| **팩토리 메서드** | +| `from_texts(texts, embedding, metadatas)` | 텍스트로부터 생성 | ✅ 구현됨 | 완전 지원 | +| `from_documents(documents, embedding)` | Document로부터 생성 | ✅ 구현됨 | 완전 지원 | +| **기타** | +| `as_retriever(**kwargs)` | VectorStoreRetriever로 변환 | ✅ 구현됨 | 완전 지원 | + +### 범례 +- ✅ **구현됨**: 현재 완전히 구현되어 사용 가능 +- 🔧 **구현 가능**: 현재 구현되지 않았지만 기술적으로 구현 가능 +- ❌ **구현 불가**: ES2 SDK 제한으로 인해 구현 불가능 + +## 지원 현황 요약 + +### ✅ 구현됨 (6개) +- `add_texts` - 텍스트 추가 +- `similarity_search` - 유사도 검색 +- `similarity_search_by_vector` - 벡터 검색 +- `from_texts` - 팩토리 메서드 +- `from_documents` - 팩토리 메서드 +- `as_retriever` - 리트리버 변환 + +### 🔧 구현 가능 (3개) +- `add_documents` - Document 객체 추가 (래핑으로 구현 가능) +- `similarity_search_with_score` - 점수와 함께 검색 (현재 `_score` 메타데이터로 제공) +- `similarity_search_with_score_by_vector` - 벡터로 점수와 함께 검색 (현재 `_score` 메타데이터로 제공) + +### ❌ 구현 불가 (4개) +- `add_documents` - Document 리스트 삽입 (지원) +- `upsert_documents` - 문서 업서트 (ES2 SDK 제한) +- `upsert_texts` - 텍스트 업서트 (ES2 SDK 제한) +- `delete` - ID로 삭제 (ES2 SDK 제한) +- `delete_documents` - Document 삭제 (ES2 SDK 제한) + +## 주요 제한사항 + +1. **개별 문서 삭제/업데이트 불가**: envector는 개별 문서의 삭제나 업데이트를 지원하지 않습니다. 전체 인덱스를 삭제해야 합니다. +2. **IDs 무시**: `add_documents`/`add_texts`에서 사용자 제공 ID는 무시됩니다. 반환값은 서버 영속 ID가 아닌 일시적 식별자입니다. + +2. **upsert 기능 없음**: 문서의 추가/업데이트를 한 번에 처리하는 upsert 기능이 없습니다. + +3. **점수 반환 방식**: `similarity_search_with_score` 메서드는 없지만, `similarity_search`에서 `_score`를 메타데이터로 제공합니다. + +## 사용 권장사항 + +- **문서 추가**: `add_texts` 메서드 사용 +- **검색**: `similarity_search` 또는 `similarity_search_by_vector` 사용 +- **점수 확인**: 검색 결과의 `metadata['_score']`에서 점수 확인 +- **RAG 파이프라인**: `as_retriever()`를 사용하여 LangChain의 RAG 워크플로우에 통합 + +## 호환성 + +envector는 LangChain의 핵심 VectorStore 기능을 지원하여 기본적인 RAG(Retrieval-Augmented Generation) 워크플로우를 구현하는 데 충분합니다. 다만 개별 문서 관리가 필요한 경우에는 다른 VectorStore 구현체를 고려해야 합니다. diff --git a/es2-msa b/es2-msa new file mode 120000 index 0000000..44b78b8 --- /dev/null +++ b/es2-msa @@ -0,0 +1 @@ +/Users/inkme/git/es2-msa \ No newline at end of file diff --git a/libs/envector/langchain_envector/client.py b/libs/envector/langchain_envector/client.py index c8b2952..6329c27 100644 --- a/libs/envector/langchain_envector/client.py +++ b/libs/envector/langchain_envector/client.py @@ -79,4 +79,3 @@ def es2(self): if self._es2 is None: raise RuntimeError("Client not initialized. Call init().") return self._es2 - diff --git a/libs/envector/langchain_envector/vectorstore.py b/libs/envector/langchain_envector/vectorstore.py index b56d0a0..0011deb 100644 --- a/libs/envector/langchain_envector/vectorstore.py +++ b/libs/envector/langchain_envector/vectorstore.py @@ -178,6 +178,28 @@ def similarity_search_by_vector( # ------------------------------- # Class constructors (LangChain compatibility) # ------------------------------- + def add_documents( + self, + documents: List[Document], + ids: Optional[List[str]] = None, + *, + vectors: Optional[List[List[float]]] = None, + **kwargs: Any, + ) -> List[int]: + """Insert a list of Documents. + + Mirrors LangChain's VectorStore API. Delegates to `add_texts` by + extracting `page_content` and `metadata` from each Document. + + Notes: + - Manual `ids` are ignored (ES2 does not support user-provided IDs). + - When `embeddings` is not configured, you must supply `vectors`. + - Returns ephemeral IDs as produced by the client insert. + """ + texts = [getattr(d, "page_content", "") for d in documents] + metadatas = [getattr(d, "metadata", {}) for d in documents] + return self.add_texts(texts=texts, metadatas=metadatas, ids=ids, vectors=vectors, **kwargs) + @classmethod def from_texts( cls, diff --git a/run_unit_tests.py b/run_unit_tests.py new file mode 100644 index 0000000..35665c7 --- /dev/null +++ b/run_unit_tests.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +"""Run unit tests without requiring a running Envector (ES2) server. + +This script runs pytest while excluding tests marked as `integration`. +It is safe to use in environments without the es2 SDK or server. +""" + +import sys +import subprocess + + +def main() -> int: + cmd = [sys.executable, "-m", "pytest", "-q", "-m", "not integration"] + try: + return subprocess.call(cmd) + except FileNotFoundError: + print("pytest not found. Install with: python -m pip install pytest", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/tests/test_vectorstore.py b/tests/test_vectorstore.py index a1bb8c7..50c4920 100644 --- a/tests/test_vectorstore.py +++ b/tests/test_vectorstore.py @@ -3,7 +3,7 @@ import re from langchain_envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig -from langchain_envector.vectorstore import Envector +from langchain_envector.vectorstore import Envector, Document as LC_Document from .conftest import FakeClient, FakeEmbeddings, FakeIndex @@ -96,3 +96,94 @@ def test_similarity_search_handles_python_literal_metadata(): # dict-type metadata is not supported currently; only text-based + + +def test_similarity_search_by_vector_with_filter_and_threshold(): + index = FakeIndex() + index.search_payload = [[ + {"id": "v-0", "score": 0.88, "metadata": "{\"text\": \"Keep\", \"metadata\": {\"k\": 1}}"}, + {"id": "v-1", "score": 0.30, "metadata": "{\"text\": \"Drop\", \"metadata\": {\"k\": 2}}"}, + ]] + client = FakeClient(index) + store = Envector(config=_cfg(), embeddings=FakeEmbeddings(dim=4), client=client) + + # Explicit vector search (bypasses embed_query), with filter/threshold + docs = store.similarity_search_by_vector([0.0, 0.0, 0.0, 0.0], k=5, filter={"k": 1}, score_threshold=0.5) + assert len(docs) == 1 + assert docs[0].page_content == "Keep" + assert docs[0].metadata["_score"] >= 0.5 + + +def test_from_texts_inserts_using_embeddings(): + client = FakeClient() + store = Envector.from_texts( + ["A", "B"], + metadatas=[{"m": "a"}, {"m": "b"}], + embeddings=FakeEmbeddings(dim=4), + config=_cfg(), + client=client, + ) + assert isinstance(store, Envector) + # One batch inserted + assert len(client.index.inserted) == 1 + # Two items packed + assert len(client.index.inserted[0]["metadata"]) == 2 + + +def test_from_documents_paths_through_to_texts(): + client = FakeClient() + docs = [ + LC_Document(page_content="X", metadata={"a": 1}), + LC_Document(page_content="Y", metadata={"a": 2}), + ] + store = Envector.from_documents(docs, embeddings=FakeEmbeddings(dim=4), config=_cfg(), client=client) + assert isinstance(store, Envector) + assert len(client.index.inserted) == 1 + packed = client.index.inserted[0]["metadata"] + # Texts preserved + assert any("\"text\": \"X\"" in m for m in packed) + assert any("\"text\": \"Y\"" in m for m in packed) + + +def test_add_documents_with_embeddings(): + client = FakeClient() + store = Envector(config=_cfg(), embeddings=FakeEmbeddings(dim=4), client=client) + + docs = [ + LC_Document(page_content="C1", metadata={"s": 1}), + LC_Document(page_content="C2", metadata={"s": 2}), + ] + ret = store.add_documents(docs) + assert len(ret) == 2 + assert len(client.index.inserted) == 1 + packed = client.index.inserted[0]["metadata"] + assert any("\"text\": \"C1\"" in m for m in packed) + assert any("\"text\": \"C2\"" in m for m in packed) + + +def test_add_documents_requires_vectors_when_no_embeddings(): + client = FakeClient() + store = Envector(config=_cfg(), embeddings=None, client=client) + docs = [LC_Document(page_content="C", metadata={})] + try: + store.add_documents(docs) + assert False, "Expected ValueError when embeddings is None and no vectors provided" + except ValueError as e: + assert "embeddings is None and vectors not provided" in str(e) + + +def test_add_documents_with_explicit_vectors(): + client = FakeClient() + store = Envector(config=_cfg(), embeddings=None, client=client) + + docs = [ + LC_Document(page_content="V1", metadata={"k": "a"}), + LC_Document(page_content="V2", metadata={"k": "b"}), + ] + vecs = [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + ] + ret = store.add_documents(docs, vectors=vecs) + assert len(ret) == 2 + assert len(client.index.inserted) == 1 From afd1c1f3a6bff0c9c293aa4306206d7cc7682c14 Mon Sep 17 00:00:00 2001 From: inkme Date: Sun, 12 Oct 2025 13:39:42 +0000 Subject: [PATCH 02/13] chore: ignore VECTORSTORE.md --- .gitignore | 10 +++++++ VECTORSTORE.md | 74 ----------------------------------------------- es2-msa | 1 - run_unit_tests.py | 23 --------------- 4 files changed, 10 insertions(+), 98 deletions(-) delete mode 100644 VECTORSTORE.md delete mode 120000 es2-msa delete mode 100644 run_unit_tests.py diff --git a/.gitignore b/.gitignore index 8a933d1..eca98cc 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,16 @@ Thumbs.db # Sensitive local data keys/ +VECTORSTORE.md + +# External symlinks (local workspace references) +es2-msa +es2-msa/ +es2-deploy +es2-deploy/ + +# Local helper scripts +run_unit_tests.py # Jupyter .ipynb_checkpoints/ diff --git a/VECTORSTORE.md b/VECTORSTORE.md deleted file mode 100644 index 7bcfd3b..0000000 --- a/VECTORSTORE.md +++ /dev/null @@ -1,74 +0,0 @@ -# LangChain VectorStore API 지원 현황 - -이 문서는 LangChain의 VectorStore 인터페이스와 현재 envector에서 지원하는 기능들을 비교 분석한 결과입니다. - -## API 지원 현황 테이블 - -| 메서드 | 설명 | 현재 상태 | 비고 | -|--------|------|-----------|------| -| **문서 추가/관리** | -| `add_documents(documents)` | Document 객체로 문서 추가 | 🔧 구현 가능 | `add_texts` 래핑으로 구현 가능 | -| `add_texts(texts, metadatas, ids)` | 텍스트로 직접 추가 | ✅ 구현됨 | 완전 지원 | -| `add_documents(documents)` | 문서 추가 | ✅ 지원 | `add_texts` 위임, 임베딩/벡터 경로 지원 | -| `upsert_documents(documents)` | 문서 추가/업데이트 | ❌ 구현 불가 | ES2 SDK 제한으로 불가능 | -| `upsert_texts(texts, metadatas, ids)` | 텍스트 추가/업데이트 | ❌ 구현 불가 | ES2 SDK 제한으로 불가능 | -| **문서 삭제** | -| `delete(ids)` | ID로 문서 삭제 | ❌ 구현 불가 | ES2 SDK 제한으로 불가능 | -| `delete_documents(documents)` | Document 객체로 삭제 | ❌ 구현 불가 | ES2 SDK 제한으로 불가능 | -| **검색** | -| `similarity_search(query, k, filter)` | 유사도 검색 | ✅ 구현됨 | 완전 지원 | -| `similarity_search_with_score(query, k, filter)` | 점수와 함께 유사도 검색 | 🔧 구현 가능 | `_score`를 메타데이터로 제공 중 | -| `similarity_search_by_vector(embedding, k, filter)` | 벡터로 직접 검색 | ✅ 구현됨 | 완전 지원 | -| `similarity_search_with_score_by_vector(embedding, k, filter)` | 벡터로 점수와 함께 검색 | 🔧 구현 가능 | `_score`를 메타데이터로 제공 중 | -| **팩토리 메서드** | -| `from_texts(texts, embedding, metadatas)` | 텍스트로부터 생성 | ✅ 구현됨 | 완전 지원 | -| `from_documents(documents, embedding)` | Document로부터 생성 | ✅ 구현됨 | 완전 지원 | -| **기타** | -| `as_retriever(**kwargs)` | VectorStoreRetriever로 변환 | ✅ 구현됨 | 완전 지원 | - -### 범례 -- ✅ **구현됨**: 현재 완전히 구현되어 사용 가능 -- 🔧 **구현 가능**: 현재 구현되지 않았지만 기술적으로 구현 가능 -- ❌ **구현 불가**: ES2 SDK 제한으로 인해 구현 불가능 - -## 지원 현황 요약 - -### ✅ 구현됨 (6개) -- `add_texts` - 텍스트 추가 -- `similarity_search` - 유사도 검색 -- `similarity_search_by_vector` - 벡터 검색 -- `from_texts` - 팩토리 메서드 -- `from_documents` - 팩토리 메서드 -- `as_retriever` - 리트리버 변환 - -### 🔧 구현 가능 (3개) -- `add_documents` - Document 객체 추가 (래핑으로 구현 가능) -- `similarity_search_with_score` - 점수와 함께 검색 (현재 `_score` 메타데이터로 제공) -- `similarity_search_with_score_by_vector` - 벡터로 점수와 함께 검색 (현재 `_score` 메타데이터로 제공) - -### ❌ 구현 불가 (4개) -- `add_documents` - Document 리스트 삽입 (지원) -- `upsert_documents` - 문서 업서트 (ES2 SDK 제한) -- `upsert_texts` - 텍스트 업서트 (ES2 SDK 제한) -- `delete` - ID로 삭제 (ES2 SDK 제한) -- `delete_documents` - Document 삭제 (ES2 SDK 제한) - -## 주요 제한사항 - -1. **개별 문서 삭제/업데이트 불가**: envector는 개별 문서의 삭제나 업데이트를 지원하지 않습니다. 전체 인덱스를 삭제해야 합니다. -2. **IDs 무시**: `add_documents`/`add_texts`에서 사용자 제공 ID는 무시됩니다. 반환값은 서버 영속 ID가 아닌 일시적 식별자입니다. - -2. **upsert 기능 없음**: 문서의 추가/업데이트를 한 번에 처리하는 upsert 기능이 없습니다. - -3. **점수 반환 방식**: `similarity_search_with_score` 메서드는 없지만, `similarity_search`에서 `_score`를 메타데이터로 제공합니다. - -## 사용 권장사항 - -- **문서 추가**: `add_texts` 메서드 사용 -- **검색**: `similarity_search` 또는 `similarity_search_by_vector` 사용 -- **점수 확인**: 검색 결과의 `metadata['_score']`에서 점수 확인 -- **RAG 파이프라인**: `as_retriever()`를 사용하여 LangChain의 RAG 워크플로우에 통합 - -## 호환성 - -envector는 LangChain의 핵심 VectorStore 기능을 지원하여 기본적인 RAG(Retrieval-Augmented Generation) 워크플로우를 구현하는 데 충분합니다. 다만 개별 문서 관리가 필요한 경우에는 다른 VectorStore 구현체를 고려해야 합니다. diff --git a/es2-msa b/es2-msa deleted file mode 120000 index 44b78b8..0000000 --- a/es2-msa +++ /dev/null @@ -1 +0,0 @@ -/Users/inkme/git/es2-msa \ No newline at end of file diff --git a/run_unit_tests.py b/run_unit_tests.py deleted file mode 100644 index 35665c7..0000000 --- a/run_unit_tests.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -"""Run unit tests without requiring a running Envector (ES2) server. - -This script runs pytest while excluding tests marked as `integration`. -It is safe to use in environments without the es2 SDK or server. -""" - -import sys -import subprocess - - -def main() -> int: - cmd = [sys.executable, "-m", "pytest", "-q", "-m", "not integration"] - try: - return subprocess.call(cmd) - except FileNotFoundError: - print("pytest not found. Install with: python -m pip install pytest", file=sys.stderr) - return 1 - - -if __name__ == "__main__": - raise SystemExit(main()) - From 78e6815f525a662465ef9a013a7906e4c00378ba Mon Sep 17 00:00:00 2001 From: inkme Date: Sun, 12 Oct 2025 13:57:07 +0000 Subject: [PATCH 03/13] ci: add PR checks workflow --- .github/workflows/pr.yml | 35 ++++++ .pre-commit-config.yaml | 10 ++ libs/envector/examples/basic_usage.py | 12 +- libs/envector/examples/cipher_query.py | 16 ++- libs/envector/examples/ingest_synthetic_1k.py | 22 +++- libs/envector/langchain_envector/__init__.py | 9 +- libs/envector/langchain_envector/client.py | 6 +- libs/envector/langchain_envector/config.py | 1 - libs/envector/langchain_envector/retriever.py | 4 +- libs/envector/langchain_envector/types.py | 24 +++- .../langchain_envector/vectorstore.py | 39 ++++-- scripts/export_hf_dataset.py | 17 ++- scripts/make_synthetic_rag_dataset.py | 4 +- scripts/run_unit_tests.py | 2 - tests/__init__.py | 1 - tests/conftest.py | 3 +- tests/integration/test_es2_integration.py | 73 +++++++++--- tests/test_types.py | 2 - tests/test_vectorstore.py | 112 +++++++++++++----- 19 files changed, 292 insertions(+), 100 deletions(-) create mode 100644 .github/workflows/pr.yml create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml new file mode 100644 index 0000000..33e75a2 --- /dev/null +++ b/.github/workflows/pr.yml @@ -0,0 +1,35 @@ +name: PR Checks + +on: + pull_request: + branches: + - main + +concurrency: + group: pr-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + checks: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + python -m pip install -e . + python -m pip install pytest pre-commit + + - name: Lint and format + run: pre-commit run --all-files --show-diff-on-failure + + - name: Run unit tests + run: python -m pytest -q -m "not integration" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..00caf48 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,10 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.7.1 + hooks: + - id: ruff + - repo: https://github.com/psf/black + rev: 24.10.0 + hooks: + - id: black + language_version: python3.11 diff --git a/libs/envector/examples/basic_usage.py b/libs/envector/examples/basic_usage.py index 796118b..7936ece 100644 --- a/libs/envector/examples/basic_usage.py +++ b/libs/envector/examples/basic_usage.py @@ -8,7 +8,12 @@ from __future__ import annotations -from libs.envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig +from libs.envector.config import ( + ConnectionConfig, + EnvectorConfig, + IndexSettings, + KeyConfig, +) from libs.envector.vectorstore import Envector @@ -16,7 +21,9 @@ def main(): # Replace with your actual settings cfg = EnvectorConfig( connection=ConnectionConfig(address="localhost:50050"), - key=KeyConfig(key_path="./keys", key_id="example_key", preset="ip", eval_mode="rmp"), + key=KeyConfig( + key_path="./keys", key_id="example_key", preset="ip", eval_mode="rmp" + ), index=IndexSettings(index_name="demo", dim=384, query_encryption="plain"), create_if_missing=True, ) @@ -43,4 +50,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/libs/envector/examples/cipher_query.py b/libs/envector/examples/cipher_query.py index d7defc3..64fe174 100644 --- a/libs/envector/examples/cipher_query.py +++ b/libs/envector/examples/cipher_query.py @@ -6,15 +6,24 @@ from __future__ import annotations -from libs.envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig +from libs.envector.config import ( + ConnectionConfig, + EnvectorConfig, + IndexSettings, + KeyConfig, +) from libs.envector.vectorstore import Envector def main(): cfg = EnvectorConfig( connection=ConnectionConfig(address="localhost:50050"), - key=KeyConfig(key_path="./keys", key_id="example_key", preset="ip", eval_mode="rmp"), - index=IndexSettings(index_name="demo_cipher", dim=384, query_encryption="cipher"), + key=KeyConfig( + key_path="./keys", key_id="example_key", preset="ip", eval_mode="rmp" + ), + index=IndexSettings( + index_name="demo_cipher", dim=384, query_encryption="cipher" + ), create_if_missing=True, ) @@ -38,4 +47,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/libs/envector/examples/ingest_synthetic_1k.py b/libs/envector/examples/ingest_synthetic_1k.py index dbb0e31..8b055c8 100644 --- a/libs/envector/examples/ingest_synthetic_1k.py +++ b/libs/envector/examples/ingest_synthetic_1k.py @@ -19,7 +19,12 @@ from pathlib import Path from typing import List -from libs.envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig +from libs.envector.config import ( + ConnectionConfig, + EnvectorConfig, + IndexSettings, + KeyConfig, +) from libs.envector.vectorstore import Envector @@ -34,7 +39,12 @@ def main(): ap.add_argument("--key-path", required=True) ap.add_argument("--key-id", required=True) ap.add_argument("--index-name", required=True) - ap.add_argument("--dim", type=int, required=False, help="If omitted and --use-embeddings, infer from model.") + ap.add_argument( + "--dim", + type=int, + required=False, + help="If omitted and --use-embeddings, infer from model.", + ) ap.add_argument("--dataset", default="data/synthetic_rag_1k.jsonl") ap.add_argument("--use-embeddings", action="store_true") ap.add_argument("--model", default="sentence-transformers/all-MiniLM-L6-v2") @@ -52,7 +62,9 @@ def main(): cfg = EnvectorConfig( connection=ConnectionConfig(address=args.address), - key=KeyConfig(key_path=args.key_path, key_id=args.key_id, preset="ip", eval_mode="rmp"), + key=KeyConfig( + key_path=args.key_path, key_id=args.key_id, preset="ip", eval_mode="rmp" + ), index=IndexSettings( index_name=args.index_name, dim=(args.dim if args.dim is not None else inferred_dim or 0), @@ -76,7 +88,9 @@ def main(): if embeddings is None: # Without embeddings, require manual vectors; here we simply skip. # Users should provide --use-embeddings or adapt to their vector source. - raise ValueError("--use-embeddings is required unless you provide vectors explicitly.") + raise ValueError( + "--use-embeddings is required unless you provide vectors explicitly." + ) store.add_texts(t_batch, metadatas=m_batch) print(f"Inserted {len(texts)} documents into index '{args.index_name}'") diff --git a/libs/envector/langchain_envector/__init__.py b/libs/envector/langchain_envector/__init__.py index a92cf2b..cf7a9b6 100644 --- a/libs/envector/langchain_envector/__init__.py +++ b/libs/envector/langchain_envector/__init__.py @@ -7,5 +7,10 @@ from .vectorstore import Envector from .config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig -__all__ = ["Envector", "ConnectionConfig", "EnvectorConfig", "IndexSettings", "KeyConfig"] - +__all__ = [ + "Envector", + "ConnectionConfig", + "EnvectorConfig", + "IndexSettings", + "KeyConfig", +] diff --git a/libs/envector/langchain_envector/client.py b/libs/envector/langchain_envector/client.py index 6329c27..c3ac2f9 100644 --- a/libs/envector/langchain_envector/client.py +++ b/libs/envector/langchain_envector/client.py @@ -1,7 +1,5 @@ from __future__ import annotations -from typing import Optional - from .config import EnvectorConfig @@ -34,7 +32,9 @@ def init(self): else: if not (c.host and c.port): raise ValueError("Either address or host+port must be provided.") - es2_client.init_connect(host=c.host, port=c.port, access_token=c.access_token) + es2_client.init_connect( + host=c.host, port=c.port, access_token=c.access_token + ) # Key path baseline for Index from es2.index import Index as _Index diff --git a/libs/envector/langchain_envector/config.py b/libs/envector/langchain_envector/config.py index b6be7c0..62e5291 100644 --- a/libs/envector/langchain_envector/config.py +++ b/libs/envector/langchain_envector/config.py @@ -39,4 +39,3 @@ class EnvectorConfig: key: KeyConfig index: IndexSettings create_if_missing: bool = True - diff --git a/libs/envector/langchain_envector/retriever.py b/libs/envector/langchain_envector/retriever.py index ee578bc..255471c 100644 --- a/libs/envector/langchain_envector/retriever.py +++ b/libs/envector/langchain_envector/retriever.py @@ -12,7 +12,9 @@ class EnvectorRetriever: - def __init__(self, store: Envector, *, search_kwargs: Optional[Dict[str, Any]] = None) -> None: + def __init__( + self, store: Envector, *, search_kwargs: Optional[Dict[str, Any]] = None + ) -> None: self.store = store self.search_kwargs = search_kwargs or {} diff --git a/libs/envector/langchain_envector/types.py b/libs/envector/langchain_envector/types.py index e4a82ae..5ad4591 100644 --- a/libs/envector/langchain_envector/types.py +++ b/libs/envector/langchain_envector/types.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union, overload +from typing import Any, Callable, Dict, List, Optional, Protocol class Embeddings(Protocol): @@ -10,10 +10,14 @@ class Embeddings(Protocol): LangChain-compatible embeddings typically implement these two methods. """ - def embed_documents(self, texts: List[str]) -> List[List[float]]: # pragma: no cover - interface only + def embed_documents( + self, texts: List[str] + ) -> List[List[float]]: # pragma: no cover - interface only ... - def embed_query(self, text: str) -> List[float]: # pragma: no cover - interface only + def embed_query( + self, text: str + ) -> List[float]: # pragma: no cover - interface only ... @@ -94,8 +98,13 @@ def unpack_metadata(raw: Any) -> Dict[str, Any]: # --- Embeddings adaptation helpers ----------------------------------------------------- + class _CallableEmbeddings: - def __init__(self, docs_fn: Callable[[List[str]], List[List[float]]], query_fn: Callable[[str], List[float]]): + def __init__( + self, + docs_fn: Callable[[List[str]], List[List[float]]], + query_fn: Callable[[str], List[float]], + ): self._docs_fn = docs_fn self._query_fn = query_fn @@ -132,7 +141,12 @@ def query_fn(text: str) -> List[float]: return _CallableEmbeddings(docs_fn, query_fn) # Case 3: Tuple of callables - if isinstance(emb, tuple) and len(emb) == 2 and callable(emb[0]) and callable(emb[1]): + if ( + isinstance(emb, tuple) + and len(emb) == 2 + and callable(emb[0]) + and callable(emb[1]) + ): docs_fn, query_fn = emb # type: ignore[assignment] return _CallableEmbeddings(docs_fn, query_fn) diff --git a/libs/envector/langchain_envector/vectorstore.py b/libs/envector/langchain_envector/vectorstore.py index 0011deb..0350af9 100644 --- a/libs/envector/langchain_envector/vectorstore.py +++ b/libs/envector/langchain_envector/vectorstore.py @@ -1,9 +1,6 @@ from __future__ import annotations -import json -from typing import Any, Dict, Iterable, List, Optional, Sequence -from uuid import uuid4 - +from typing import Any, Dict, List, Optional from .config import EnvectorConfig from .client import EnvectorClient from .types import Embeddings, as_embeddings, pack_metadata, unpack_metadata @@ -21,12 +18,15 @@ def _try_import_langchain(): except Exception: # pragma: no cover - optional dependency # Minimal shim if LangChain is not installed class Document: # type: ignore - def __init__(self, page_content: str, metadata: Optional[Dict[str, Any]] = None): + def __init__( + self, page_content: str, metadata: Optional[Dict[str, Any]] = None + ): self.page_content = page_content self.metadata = metadata or {} try: from langchain_core.vectorstores import VectorStore as _VectorStore # type: ignore + VectorStoreBase = _VectorStore except Exception: # pragma: no cover - optional dependency pass @@ -119,9 +119,15 @@ def similarity_search( top_k = fetch_k or self.config.index.fetch_k or k - results = self.client.index.search(query=embedding, top_k=top_k, output_fields=self.config.index.output_fields) + results = self.client.index.search( + query=embedding, top_k=top_k, output_fields=self.config.index.output_fields + ) # ES2 Index.search returns a list for each query; we passed single query - result = results[0] if isinstance(results, list) and results and isinstance(results[0], list) else results + result = ( + results[0] + if isinstance(results, list) and results and isinstance(results[0], list) + else results + ) docs = [] # Iterate from top-1 to top-k @@ -129,7 +135,7 @@ def similarity_search( # item = {"id": ..., "score": float, "metadata": [str] or {...}} score = float(item.get("score", 0.0)) md_obj_raw = item.get("metadata") - + # Metadata encryption/decryption is handled by the SDK. # Envector currently supports a single associated data field (string). # Convention: if the string is JSON like {"text": str, "metadata": {...}}, @@ -148,7 +154,10 @@ def similarity_search( if score_threshold is not None and score < score_threshold: continue - doc = Document(page_content=text, metadata={**metadata, "_score": score, "_id": item.get("id")}) + doc = Document( + page_content=text, + metadata={**metadata, "_score": score, "_id": item.get("id")}, + ) docs.append(doc) # Trim to k after filtering @@ -198,7 +207,9 @@ def add_documents( """ texts = [getattr(d, "page_content", "") for d in documents] metadatas = [getattr(d, "metadata", {}) for d in documents] - return self.add_texts(texts=texts, metadatas=metadatas, ids=ids, vectors=vectors, **kwargs) + return self.add_texts( + texts=texts, metadatas=metadatas, ids=ids, vectors=vectors, **kwargs + ) @classmethod def from_texts( @@ -233,7 +244,9 @@ def from_documents( ) -> "Envector": # type: ignore[override] texts = [d.page_content for d in documents] metadatas = [getattr(d, "metadata", {}) for d in documents] - return cls.from_texts(texts=texts, metadatas=metadatas, embeddings=embeddings, **kwargs) + return cls.from_texts( + texts=texts, metadatas=metadatas, embeddings=embeddings, **kwargs + ) # Optional: if LangChain is installed, this will be used; otherwise, users may call similarity_search directly. def as_retriever(self, **kwargs: Any): # pragma: no cover - wrapper @@ -244,7 +257,9 @@ def as_retriever(self, **kwargs: Any): # pragma: no cover - wrapper except Exception: # Minimal shim if VectorStoreRetriever is unavailable class _Retriever: - def __init__(self, vs: Envector, search_kwargs: Optional[Dict[str, Any]] = None): + def __init__( + self, vs: Envector, search_kwargs: Optional[Dict[str, Any]] = None + ): self.vs = vs self.search_kwargs = search_kwargs or {} diff --git a/scripts/export_hf_dataset.py b/scripts/export_hf_dataset.py index d49f299..88de37e 100644 --- a/scripts/export_hf_dataset.py +++ b/scripts/export_hf_dataset.py @@ -17,16 +17,22 @@ import argparse import json from pathlib import Path -from typing import List def main(): ap = argparse.ArgumentParser() ap.add_argument("--name", required=True, help="HF dataset name, e.g., ag_news") - ap.add_argument("--subset", default=None, help="Optional subset/config of the dataset") + ap.add_argument( + "--subset", default=None, help="Optional subset/config of the dataset" + ) ap.add_argument("--split", default="train") ap.add_argument("--text-column", required=True) - ap.add_argument("--meta-columns", nargs="*", default=[], help="Optional metadata columns to carry over") + ap.add_argument( + "--meta-columns", + nargs="*", + default=[], + help="Optional metadata columns to carry over", + ) ap.add_argument("--size", type=int, default=1000) ap.add_argument("--seed", type=int, default=42) ap.add_argument("--out", default="data/hf_export.jsonl") @@ -47,7 +53,9 @@ def main(): with out_path.open("w", encoding="utf-8") as f: for row in ds: text = row[args.text_column] - meta = {k: row.get(k) for k in args.meta_columns} if args.meta_columns else {} + meta = ( + {k: row.get(k) for k in args.meta_columns} if args.meta_columns else {} + ) rec = {"text": text, "metadata": meta} f.write(json.dumps(rec, ensure_ascii=False) + "\n") @@ -56,4 +64,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/scripts/make_synthetic_rag_dataset.py b/scripts/make_synthetic_rag_dataset.py index 8badf1c..b9f4b61 100644 --- a/scripts/make_synthetic_rag_dataset.py +++ b/scripts/make_synthetic_rag_dataset.py @@ -11,7 +11,6 @@ import argparse import json -import os import random from pathlib import Path @@ -59,7 +58,7 @@ def make_sentence(topic: str) -> str: def make_paragraph(topic: str, min_sent: int = 3, max_sent: int = 7) -> str: n = random.randint(min_sent, max_sent) - return " " .join(make_sentence(topic) for _ in range(n)) + return " ".join(make_sentence(topic) for _ in range(n)) def main(): @@ -86,4 +85,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/scripts/run_unit_tests.py b/scripts/run_unit_tests.py index ecd1cdc..3370e56 100644 --- a/scripts/run_unit_tests.py +++ b/scripts/run_unit_tests.py @@ -2,7 +2,6 @@ import importlib import inspect -import sys import traceback @@ -43,4 +42,3 @@ def main() -> int: if __name__ == "__main__": raise SystemExit(main()) - diff --git a/tests/__init__.py b/tests/__init__.py index c10059a..070d470 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -13,4 +13,3 @@ pkg_path = str(_PKG_DIR) if pkg_path not in sys.path: sys.path.insert(0, pkg_path) - diff --git a/tests/conftest.py b/tests/conftest.py index 47172b6..fe7954e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,8 +24,7 @@ class FakeIndex: def insert(self, data: List[List[float]], metadata: List[str]): self.inserted.append({"data": data, "metadata": metadata}) - batch_idx = len(self.inserted) - 1 - return [len(self.inserted)+i+1 for i in range(len(metadata))] + return [len(self.inserted) + i + 1 for i in range(len(metadata))] def search(self, query: List[float], top_k: int, output_fields: List[str]): if self.search_payload is not None: diff --git a/tests/integration/test_es2_integration.py b/tests/integration/test_es2_integration.py index c43661c..a2bf967 100644 --- a/tests/integration/test_es2_integration.py +++ b/tests/integration/test_es2_integration.py @@ -5,7 +5,12 @@ import time import pytest -from langchain_envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig +from langchain_envector.config import ( + ConnectionConfig, + EnvectorConfig, + IndexSettings, + KeyConfig, +) from langchain_envector.vectorstore import Envector @@ -33,13 +38,17 @@ def test_e2e_vectorstore_plain_and_cipher(): key_path = _require_env("ES2_KEY_PATH") key_id = _require_env("ES2_KEY_ID") use_emb = os.environ.get("ES2_USE_EMBEDDINGS") in {"1", "true", "TRUE", "yes"} - model_name = os.environ.get("ES2_EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2") + model_name = os.environ.get( + "ES2_EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2" + ) use_hf = os.environ.get("ES2_USE_HF_DATASET") in {"1", "true", "TRUE", "yes"} hf_name = os.environ.get("ES2_HF_NAME", "ag_news") hf_subset = os.environ.get("ES2_HF_SUBSET") hf_split = os.environ.get("ES2_HF_SPLIT", "train") hf_text_col = os.environ.get("ES2_HF_TEXT_COL", "text") - hf_meta_cols = [c for c in os.environ.get("ES2_HF_META_COLS", "label").split(",") if c] + hf_meta_cols = [ + c for c in os.environ.get("ES2_HF_META_COLS", "label").split(",") if c + ] hf_size = int(os.environ.get("ES2_HF_SIZE", "200")) hf_seed = int(os.environ.get("ES2_HF_SEED", "42")) @@ -69,9 +78,12 @@ def test_e2e_vectorstore_plain_and_cipher(): if dim < 16 or dim > 4096: pytest.skip("Envector supports dimensions in [16, 4096]") - base_index_name = os.environ.get("ES2_INDEX_NAME", f"inttest_{secrets.token_hex(4)}") + base_index_name = os.environ.get( + "ES2_INDEX_NAME", f"inttest_{secrets.token_hex(4)}" + ) import es2 + es2.init_connect(address=address) es2.reset() @@ -79,7 +91,9 @@ def test_e2e_vectorstore_plain_and_cipher(): cfg_plain = EnvectorConfig( connection=ConnectionConfig(address=address), key=KeyConfig(key_path=key_path, key_id=key_id, preset="ip", eval_mode="rmp"), - index=IndexSettings(index_name=f"{base_index_name}_plain", dim=dim, query_encryption="plain"), + index=IndexSettings( + index_name=f"{base_index_name}_plain", dim=dim, query_encryption="plain" + ), create_if_missing=True, ) store_plain = Envector(config=cfg_plain, embeddings=(emb if use_emb else None)) @@ -93,14 +107,14 @@ def test_e2e_vectorstore_plain_and_cipher(): if hf_size and hf_size < len(ds): ds = ds.shuffle(seed=hf_seed).select(range(hf_size)) texts = [row[hf_text_col] for row in ds] - metas = [ - {k: row.get(k) for k in hf_meta_cols if k in row} - for row in ds - ] + metas = [{k: row.get(k) for k in hf_meta_cols if k in row} for row in ds] print(texts[0]) print(metas[0]) else: - texts = ["machine learning accelerates research", "cooking recipes are delicious"] + texts = [ + "machine learning accelerates research", + "cooking recipes are delicious", + ] metas = [{"label": "A"}, {"label": "B"}] if use_emb: @@ -120,21 +134,34 @@ def test_e2e_vectorstore_plain_and_cipher(): docs = store_plain.similarity_search(q1, k=3) print("[plain] top-3 results for:", q1) for d in docs: - print(" - score=", d.metadata.get("_score"), "text=", (d.page_content[:80] + ("..." if len(d.page_content) > 80 else ""))) + print( + " - score=", + d.metadata.get("_score"), + "text=", + (d.page_content[:80] + ("..." if len(d.page_content) > 80 else "")), + ) assert len(docs) >= 1 assert all("_id" in d.metadata for d in docs) # optional filter check if 'label' is part of meta if not use_hf: - docs_f = store_plain.similarity_search("cooking", k=2, filter={"label": "B"}) + docs_f = store_plain.similarity_search( + "cooking", k=2, filter={"label": "B"} + ) print("[plain] filtered results (label=B):", [d.metadata for d in docs_f]) - assert len(docs_f) >= 1 and all(d.metadata.get("label") == "B" for d in docs_f) + assert len(docs_f) >= 1 and all( + d.metadata.get("label") == "B" for d in docs_f + ) else: # Using explicit embeddings docs = store_plain.similarity_search("q", k=2, embedding=e1) - print("[plain] results (explicit embedding e1):", [d.page_content for d in docs]) + print( + "[plain] results (explicit embedding e1):", [d.page_content for d in docs] + ) assert any(d.page_content == texts[0] for d in docs) assert all("_id" in d.metadata for d in docs) - docs_f = store_plain.similarity_search("q", k=2, embedding=e2, filter={"label": "B"}) + docs_f = store_plain.similarity_search( + "q", k=2, embedding=e2, filter={"label": "B"} + ) print("[plain] filtered (e2, label=B):", [d.page_content for d in docs_f]) assert len(docs_f) >= 1 assert docs_f[0].page_content == texts[1] @@ -143,7 +170,9 @@ def test_e2e_vectorstore_plain_and_cipher(): cfg_cc = EnvectorConfig( connection=ConnectionConfig(address=address), key=KeyConfig(key_path=key_path, key_id=key_id, preset="ip", eval_mode="rmp"), - index=IndexSettings(index_name=f"{base_index_name}_cipher", dim=dim, query_encryption="cipher"), + index=IndexSettings( + index_name=f"{base_index_name}_cipher", dim=dim, query_encryption="cipher" + ), create_if_missing=True, ) store_cc = Envector(config=cfg_cc, embeddings=(emb if use_emb else None)) @@ -158,12 +187,20 @@ def test_e2e_vectorstore_plain_and_cipher(): docs_cc = store_cc.similarity_search(q2, k=3) print("[cipher] top-3 results for:", q2) for d in docs_cc: - print(" - score=", d.metadata.get("_score"), "text=", (d.page_content[:80] + ("..." if len(d.page_content) > 80 else ""))) + print( + " - score=", + d.metadata.get("_score"), + "text=", + (d.page_content[:80] + ("..." if len(d.page_content) > 80 else "")), + ) assert len(docs_cc) >= 1 assert all("_id" in d.metadata for d in docs_cc) else: docs_cc = store_cc.similarity_search("q", k=2, embedding=e2) - print("[cipher] results (explicit embedding e2):", [d.page_content for d in docs_cc]) + print( + "[cipher] results (explicit embedding e2):", + [d.page_content for d in docs_cc], + ) assert any(d.page_content == texts[1] for d in docs_cc) assert all("_id" in d.metadata for d in docs_cc) diff --git a/tests/test_types.py b/tests/test_types.py index c85c371..65cd546 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -1,7 +1,5 @@ from __future__ import annotations -import json - from langchain_envector.types import pack_metadata, unpack_metadata diff --git a/tests/test_vectorstore.py b/tests/test_vectorstore.py index 50c4920..b572ee5 100644 --- a/tests/test_vectorstore.py +++ b/tests/test_vectorstore.py @@ -1,8 +1,11 @@ from __future__ import annotations -import re - -from langchain_envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig +from langchain_envector.config import ( + ConnectionConfig, + EnvectorConfig, + IndexSettings, + KeyConfig, +) from langchain_envector.vectorstore import Envector, Document as LC_Document from .conftest import FakeClient, FakeEmbeddings, FakeIndex @@ -20,7 +23,9 @@ def test_add_texts_ignores_ids_and_returns_item_ids(): client = FakeClient() store = Envector(config=_cfg(), embeddings=FakeEmbeddings(dim=4), client=client) - ret_ids = store.add_texts(["t1", "t2"], metadatas=[{"m": 1}, {"m": 2}], ids=["a", "b"]) # ids ignored + ret_ids = store.add_texts( + ["t1", "t2"], metadatas=[{"m": 1}, {"m": 2}], ids=["a", "b"] + ) # ids ignored # Returned IDs assert len(ret_ids) == 2 @@ -30,20 +35,32 @@ def test_add_texts_ignores_ids_and_returns_item_ids(): assert len(client.index.inserted) == 1 packed = client.index.inserted[0]["metadata"] assert len(packed) == 2 - assert "\"id\"" not in packed[0] + assert '"id"' not in packed[0] def test_similarity_search_with_filter_and_threshold(): index = FakeIndex() # Two items, different scores and tags - index.search_payload = [[ - {"id": "pos-0", "score": 0.95, "metadata": "{\"text\": \"A\", \"metadata\": {\"tag\": \"keep\"}}"}, - {"id": "pos-1", "score": 0.40, "metadata": "{\"text\": \"B\", \"metadata\": {\"tag\": \"drop\"}}"}, - ]] + index.search_payload = [ + [ + { + "id": "pos-0", + "score": 0.95, + "metadata": '{"text": "A", "metadata": {"tag": "keep"}}', + }, + { + "id": "pos-1", + "score": 0.40, + "metadata": '{"text": "B", "metadata": {"tag": "drop"}}', + }, + ] + ] client = FakeClient(index) store = Envector(config=_cfg(), embeddings=FakeEmbeddings(dim=4), client=client) - docs = store.similarity_search("q", k=5, filter={"tag": "keep"}, score_threshold=0.5) + docs = store.similarity_search( + "q", k=5, filter={"tag": "keep"}, score_threshold=0.5 + ) assert len(docs) == 1 assert docs[0].page_content == "A" assert docs[0].metadata["_score"] >= 0.5 @@ -52,9 +69,15 @@ def test_similarity_search_with_filter_and_threshold(): def test_similarity_search_handles_string_metadata(): index = FakeIndex() # metadata returned as a single JSON string instead of list - index.search_payload = [[ - {"id": "pos-0", "score": 0.8, "metadata": "{\"text\": \"S\", \"metadata\": {\"t\": 1}}"}, - ]] + index.search_payload = [ + [ + { + "id": "pos-0", + "score": 0.8, + "metadata": '{"text": "S", "metadata": {"t": 1}}', + }, + ] + ] client = FakeClient(index) store = Envector(config=_cfg(), embeddings=FakeEmbeddings(dim=4), client=client) @@ -67,9 +90,15 @@ def test_similarity_search_handles_string_metadata(): def test_similarity_search_uses_raw_text_when_not_json(): index = FakeIndex() # metadata is a plain string (not JSON); should be treated as page_content - index.search_payload = [[ - {"id": "pos-raw", "score": 0.6, "metadata": "Plain text content without JSON"}, - ]] + index.search_payload = [ + [ + { + "id": "pos-raw", + "score": 0.6, + "metadata": "Plain text content without JSON", + }, + ] + ] client = FakeClient(index) store = Envector(config=_cfg(), embeddings=FakeEmbeddings(dim=4), client=client) @@ -77,15 +106,19 @@ def test_similarity_search_uses_raw_text_when_not_json(): assert len(docs) == 1 assert docs[0].page_content == "Plain text content without JSON" # user metadata should be empty dict when not provided - assert all(k in docs[0].metadata for k in ["_score", "_id"]) # only system fields present + assert all( + k in docs[0].metadata for k in ["_score", "_id"] + ) # only system fields present def test_similarity_search_handles_python_literal_metadata(): index = FakeIndex() literal = str({"text": "Literal", "metadata": {"tag": "py"}}) - index.search_payload = [[ - {"id": "pos-lit", "score": 0.7, "metadata": literal}, - ]] + index.search_payload = [ + [ + {"id": "pos-lit", "score": 0.7, "metadata": literal}, + ] + ] client = FakeClient(index) store = Envector(config=_cfg(), embeddings=FakeEmbeddings(dim=4), client=client) @@ -94,21 +127,32 @@ def test_similarity_search_handles_python_literal_metadata(): assert docs[0].page_content == "Literal" assert docs[0].metadata.get("tag") == "py" - # dict-type metadata is not supported currently; only text-based def test_similarity_search_by_vector_with_filter_and_threshold(): index = FakeIndex() - index.search_payload = [[ - {"id": "v-0", "score": 0.88, "metadata": "{\"text\": \"Keep\", \"metadata\": {\"k\": 1}}"}, - {"id": "v-1", "score": 0.30, "metadata": "{\"text\": \"Drop\", \"metadata\": {\"k\": 2}}"}, - ]] + index.search_payload = [ + [ + { + "id": "v-0", + "score": 0.88, + "metadata": '{"text": "Keep", "metadata": {"k": 1}}', + }, + { + "id": "v-1", + "score": 0.30, + "metadata": '{"text": "Drop", "metadata": {"k": 2}}', + }, + ] + ] client = FakeClient(index) store = Envector(config=_cfg(), embeddings=FakeEmbeddings(dim=4), client=client) # Explicit vector search (bypasses embed_query), with filter/threshold - docs = store.similarity_search_by_vector([0.0, 0.0, 0.0, 0.0], k=5, filter={"k": 1}, score_threshold=0.5) + docs = store.similarity_search_by_vector( + [0.0, 0.0, 0.0, 0.0], k=5, filter={"k": 1}, score_threshold=0.5 + ) assert len(docs) == 1 assert docs[0].page_content == "Keep" assert docs[0].metadata["_score"] >= 0.5 @@ -136,13 +180,15 @@ def test_from_documents_paths_through_to_texts(): LC_Document(page_content="X", metadata={"a": 1}), LC_Document(page_content="Y", metadata={"a": 2}), ] - store = Envector.from_documents(docs, embeddings=FakeEmbeddings(dim=4), config=_cfg(), client=client) + store = Envector.from_documents( + docs, embeddings=FakeEmbeddings(dim=4), config=_cfg(), client=client + ) assert isinstance(store, Envector) assert len(client.index.inserted) == 1 packed = client.index.inserted[0]["metadata"] # Texts preserved - assert any("\"text\": \"X\"" in m for m in packed) - assert any("\"text\": \"Y\"" in m for m in packed) + assert any('"text": "X"' in m for m in packed) + assert any('"text": "Y"' in m for m in packed) def test_add_documents_with_embeddings(): @@ -157,8 +203,8 @@ def test_add_documents_with_embeddings(): assert len(ret) == 2 assert len(client.index.inserted) == 1 packed = client.index.inserted[0]["metadata"] - assert any("\"text\": \"C1\"" in m for m in packed) - assert any("\"text\": \"C2\"" in m for m in packed) + assert any('"text": "C1"' in m for m in packed) + assert any('"text": "C2"' in m for m in packed) def test_add_documents_requires_vectors_when_no_embeddings(): @@ -167,7 +213,9 @@ def test_add_documents_requires_vectors_when_no_embeddings(): docs = [LC_Document(page_content="C", metadata={})] try: store.add_documents(docs) - assert False, "Expected ValueError when embeddings is None and no vectors provided" + assert ( + False + ), "Expected ValueError when embeddings is None and no vectors provided" except ValueError as e: assert "embeddings is None and vectors not provided" in str(e) From a1d89c632bf5cdc4f33e8bba09e618eedf8f4592 Mon Sep 17 00:00:00 2001 From: Jungjoo Seo <115966721+inkme9@users.noreply.github.com> Date: Sun, 12 Oct 2025 23:08:16 +0900 Subject: [PATCH 04/13] Update runs-on --- .github/workflows/pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 33e75a2..7d4ab9b 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -11,7 +11,7 @@ concurrency: jobs: checks: - runs-on: ubuntu-latest + runs-on: [self-hosted, linux, no-gpu] steps: - name: Checkout repository From b7171b0e8f1d9f1b9ddafd13521f0dbc5d467847 Mon Sep 17 00:00:00 2001 From: inkme Date: Sun, 12 Oct 2025 23:53:49 +0000 Subject: [PATCH 05/13] ES2-979: add score-returning similarity search APIs --- .../langchain_envector/vectorstore.py | 106 ++++++++++++++---- tests/test_vectorstore.py | 52 +++++++++ 2 files changed, 137 insertions(+), 21 deletions(-) diff --git a/libs/envector/langchain_envector/vectorstore.py b/libs/envector/langchain_envector/vectorstore.py index 0350af9..eef1cb5 100644 --- a/libs/envector/langchain_envector/vectorstore.py +++ b/libs/envector/langchain_envector/vectorstore.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple from .config import EnvectorConfig from .client import EnvectorClient from .types import Embeddings, as_embeddings, pack_metadata, unpack_metadata @@ -96,27 +96,16 @@ def add_texts( # but they are NOT persisted/addressable. return result_ids - def similarity_search( + def _similarity_search_with_scores( self, - query: str, - k: int = 4, *, + embedding: List[float], + k: int, filter: Optional[Dict[str, Any]] = None, score_threshold: Optional[float] = None, fetch_k: Optional[int] = None, **kwargs: Any, - ) -> List[Document]: - """Search similar items for a text query. - - - Embeds query if embeddings are provided; else expect `embedding` kwarg. - - Applies optional client-side filter and score threshold. - """ - embedding: Optional[List[float]] = kwargs.get("embedding") - if embedding is None: - if self._embeddings is None: - raise ValueError("embeddings is None and no `embedding` provided") - embedding = self._embeddings.embed_query(query) - + ) -> List[Tuple[Document, float]]: top_k = fetch_k or self.config.index.fetch_k or k results = self.client.index.search( @@ -129,7 +118,7 @@ def similarity_search( else results ) - docs = [] + docs_with_scores: List[Tuple[Document, float]] = [] # Iterate from top-1 to top-k for item in result: # item = {"id": ..., "score": float, "metadata": [str] or {...}} @@ -158,10 +147,66 @@ def similarity_search( page_content=text, metadata={**metadata, "_score": score, "_id": item.get("id")}, ) - docs.append(doc) + docs_with_scores.append((doc, score)) # Trim to k after filtering - return docs[:k] + return docs_with_scores[:k] + + def similarity_search( + self, + query: str, + k: int = 4, + *, + filter: Optional[Dict[str, Any]] = None, + score_threshold: Optional[float] = None, + fetch_k: Optional[int] = None, + **kwargs: Any, + ) -> List[Document]: + """Search similar items for a text query. + + - Embeds query if embeddings are provided; else expect `embedding` kwarg. + - Applies optional client-side filter and score threshold. + """ + embedding: Optional[List[float]] = kwargs.pop("embedding", None) + if embedding is None: + if self._embeddings is None: + raise ValueError("embeddings is None and no `embedding` provided") + embedding = self._embeddings.embed_query(query) + + docs_with_scores = self._similarity_search_with_scores( + embedding=embedding, + k=k, + filter=filter, + score_threshold=score_threshold, + fetch_k=fetch_k, + **kwargs, + ) + return [doc for doc, _ in docs_with_scores] + + def similarity_search_with_score( + self, + query: str, + k: int = 4, + *, + filter: Optional[Dict[str, Any]] = None, + score_threshold: Optional[float] = None, + fetch_k: Optional[int] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + embedding: Optional[List[float]] = kwargs.pop("embedding", None) + if embedding is None: + if self._embeddings is None: + raise ValueError("embeddings is None and no `embedding` provided") + embedding = self._embeddings.embed_query(query) + + return self._similarity_search_with_scores( + embedding=embedding, + k=k, + filter=filter, + score_threshold=score_threshold, + fetch_k=fetch_k, + **kwargs, + ) # Vector-based variant required by some VectorStore interfaces def similarity_search_by_vector( @@ -174,13 +219,32 @@ def similarity_search_by_vector( fetch_k: Optional[int] = None, **kwargs: Any, ) -> List[Document]: - return self.similarity_search( - query="", # unused + docs_with_scores = self._similarity_search_with_scores( + embedding=embedding, k=k, filter=filter, score_threshold=score_threshold, fetch_k=fetch_k, + **kwargs, + ) + return [doc for doc, _ in docs_with_scores] + + def similarity_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + *, + filter: Optional[Dict[str, Any]] = None, + score_threshold: Optional[float] = None, + fetch_k: Optional[int] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + return self._similarity_search_with_scores( embedding=embedding, + k=k, + filter=filter, + score_threshold=score_threshold, + fetch_k=fetch_k, **kwargs, ) diff --git a/tests/test_vectorstore.py b/tests/test_vectorstore.py index b572ee5..8178c52 100644 --- a/tests/test_vectorstore.py +++ b/tests/test_vectorstore.py @@ -158,6 +158,58 @@ def test_similarity_search_by_vector_with_filter_and_threshold(): assert docs[0].metadata["_score"] >= 0.5 +def test_similarity_search_with_score_returns_tuples(): + index = FakeIndex() + index.search_payload = [ + [ + { + "id": "s-0", + "score": 0.77, + "metadata": '{"text": "Doc0", "metadata": {"tag": "x"}}', + }, + { + "id": "s-1", + "score": 0.25, + "metadata": '{"text": "Doc1", "metadata": {"tag": "y"}}', + }, + ] + ] + client = FakeClient(index) + store = Envector(config=_cfg(), embeddings=FakeEmbeddings(dim=4), client=client) + + results = store.similarity_search_with_score("query", k=2) + assert len(results) == 2 + first_doc, first_score = results[0] + assert isinstance(first_doc, LC_Document) + assert first_doc.page_content == "Doc0" + assert first_doc.metadata["_score"] == first_score + assert first_doc.metadata["_id"] == "s-0" + + +def test_similarity_search_with_score_by_vector_returns_tuples(): + index = FakeIndex() + index.search_payload = [ + [ + { + "id": "sv-0", + "score": 0.66, + "metadata": '{"text": "VectorDoc", "metadata": {"tag": "keep"}}', + } + ] + ] + client = FakeClient(index) + store = Envector(config=_cfg(), embeddings=FakeEmbeddings(dim=4), client=client) + + results = store.similarity_search_with_score_by_vector( + [0.0, 0.0, 0.0, 0.0], k=1, filter={"tag": "keep"}, score_threshold=0.5 + ) + assert len(results) == 1 + doc, score = results[0] + assert doc.page_content == "VectorDoc" + assert score == doc.metadata["_score"] + assert doc.metadata["_id"] == "sv-0" + + def test_from_texts_inserts_using_embeddings(): client = FakeClient() store = Envector.from_texts( From 78d2ce24d0c2be107a538dd222a174f71dbc1d25 Mon Sep 17 00:00:00 2001 From: suyeong Date: Thu, 13 Nov 2025 06:01:44 +0000 Subject: [PATCH 06/13] fix readme --- README.md | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 7a231ae..9a616b3 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Encrypted vector search for LangChain using Envector (ES2), powered by homomorph - `python3.11 -m venv .venv && source .venv/bin/activate` - Install runtime dependencies: - `pip install -U pip setuptools wheel` - - `pip install es2==1.1.0rc2 langchain sentence-transformers` + - `pip install es2 langchain sentence-transformers` ## Usage Overview 1. Configure Envector using `EnvectorConfig`, pointing to your ES2 endpoint and keys. @@ -27,7 +27,7 @@ Encrypted vector search for LangChain using Envector (ES2), powered by homomorph Key dataclasses live in `libs/envector/config.py`: - `ConnectionConfig`: address or host/port for ES2. - `KeyConfig`: key path, key ID, optional preset/eval mode. -- `IndexSettings`: index name, dimension (16–4096), query encryption mode, optional output fields and fetch parameters. +- `IndexSettings`: index name, dimension (32–4096), query encryption mode, optional output fields and fetch parameters. - `EnvectorConfig`: wraps the above and enables auto-creation via `create_if_missing`. ## Data Model @@ -42,15 +42,42 @@ Key dataclasses live in `libs/envector/config.py`: - Filtering happens client-side; ensure metadata is JSON for structured filters. ## Examples +- Configuration + + ```python + from langchain_envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig + + cfg = EnvectorConfig( + connection=ConnectionConfig(address=ES2_ADDRESS, access_token=ES2_ACCESS_TOKEN), + key=KeyConfig(key_path=ES2_KEY_PATH, key_id=ES2_KEY_ID, preset="ip", eval_mode="rmp"), + index=IndexSettings(index_name=INDEX_NAME, dim=DIM), + create_if_missing=True, + ) + ``` + - Add documents (from LangChain Documents): - - Python - - from langchain_core.documents import Document - - docs = [ - Document(page_content="chunk-1", metadata={"source": "paper.pdf", "page": 1, "chunk": 0}), - Document(page_content="chunk-2", metadata={"source": "paper.pdf", "page": 1, "chunk": 1}), - ] - - store = Envector(config=cfg, embeddings=emb) - - store.add_documents(docs) + + ```python + from langchain_core.documents import Document + from langchain_envector.vectorstore import Envector + + docs = [Document(page_content="chunk-1", metadata={"source": "doc.pdf", "page": 1, "chunk": 0})] + + store = Envector(config=cfg, embeddings=emb) + store.add_documents(docs) + ``` + + The method `add_texts` is also available to store texts. + +- Similarity search + + ```python + results = store.similarity_search_with_score(query, k=3) + for doc, score in results: + print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]") + ``` + + The methods `similarity_search` and `similarity_search_with_vector` are also available to perform vector search. ## Troubleshooting - Connection issues: verify ES2 address and registered keys. From 09277ae94ef5dcc705a7355560d4e096eb58dbb3 Mon Sep 17 00:00:00 2001 From: suyeong Date: Thu, 13 Nov 2025 06:06:48 +0000 Subject: [PATCH 07/13] fix readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9a616b3..20e4362 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Key dataclasses live in `libs/envector/config.py`: print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]") ``` - The methods `similarity_search` and `similarity_search_with_vector` are also available to perform vector search. + The methods `similarity_search` and `similarity_search_with_vector` (with `embeddings.embed_query()`) are also available to perform vector search. ## Troubleshooting - Connection issues: verify ES2 address and registered keys. From f5351b7dc95ea72206b7d3e7ba102bfbf003987d Mon Sep 17 00:00:00 2001 From: suyeong Date: Thu, 13 Nov 2025 06:17:03 +0000 Subject: [PATCH 08/13] update pytest in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 20e4362..14fa220 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ Key dataclasses live in `libs/envector/config.py`: ## Testing Without ES2 - Run unit tests offline (no ES2 or SDK required): - `python -m pytest -q -m "not integration"` - - or `python run_unit_tests.py` + - or `python scripts/run_unit_tests.py` - Run integration tests (requires server and keys): - Export `ES2_ADDRESS`, `ES2_KEY_PATH`, `ES2_KEY_ID` - Optional: `ES2_USE_EMBEDDINGS=1`, `ES2_EMB_MODEL`, `ES2_USE_HF_DATASET=1` From 057c8bb231066d441cb53a674b6b1b0313373c39 Mon Sep 17 00:00:00 2001 From: suyeong Date: Thu, 13 Nov 2025 06:17:34 +0000 Subject: [PATCH 09/13] rm debug log? --- libs/envector/langchain_envector/types.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/libs/envector/langchain_envector/types.py b/libs/envector/langchain_envector/types.py index 5ad4591..286a24c 100644 --- a/libs/envector/langchain_envector/types.py +++ b/libs/envector/langchain_envector/types.py @@ -57,8 +57,6 @@ def unpack_metadata(raw: Any) -> Dict[str, Any]: if isinstance(raw, dict): return raw - print("slafjklshglkhslafhlksadjlghsal;hf") - # Some responses wrap the payload in a single-element list. if isinstance(raw, (list, tuple)): if len(raw) == 1: From 430cb45dc13134dc235b403832c62188e146a4a8 Mon Sep 17 00:00:00 2001 From: suyeong Date: Thu, 13 Nov 2025 06:18:23 +0000 Subject: [PATCH 10/13] fix python tests path in unit test script --- scripts/run_unit_tests.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/run_unit_tests.py b/scripts/run_unit_tests.py index 3370e56..489698b 100644 --- a/scripts/run_unit_tests.py +++ b/scripts/run_unit_tests.py @@ -2,7 +2,13 @@ import importlib import inspect +import sys import traceback +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) def run_module_tests(module_name: str) -> list[tuple[str, bool, str]]: From 16ddaeb2be488f69c583f77a49ecbf181b5b6d81 Mon Sep 17 00:00:00 2001 From: suyeong Date: Thu, 13 Nov 2025 06:18:54 +0000 Subject: [PATCH 11/13] fix minimum dimension in pytest --- tests/integration/test_es2_integration.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_es2_integration.py b/tests/integration/test_es2_integration.py index a2bf967..03aeb81 100644 --- a/tests/integration/test_es2_integration.py +++ b/tests/integration/test_es2_integration.py @@ -73,10 +73,10 @@ def test_e2e_vectorstore_plain_and_cipher(): except Exception as e: pytest.skip(f"Embeddings requested but unavailable: {e}") else: - dim = int(dim_env or "16") + dim = int(dim_env or "32") - if dim < 16 or dim > 4096: - pytest.skip("Envector supports dimensions in [16, 4096]") + if dim < 32 or dim > 4096: + pytest.skip("Envector supports dimensions in [32, 4096]") base_index_name = os.environ.get( "ES2_INDEX_NAME", f"inttest_{secrets.token_hex(4)}" From ef527eaa92692977248e980f8351b4a7c1c3befc Mon Sep 17 00:00:00 2001 From: suyeong Date: Mon, 8 Dec 2025 00:59:45 +0000 Subject: [PATCH 12/13] update renamed sdk --- .github/workflows/release.yaml | 2 +- .gitignore | 4 +- CONTRIBUTE.md | 8 +-- README.md | 28 +++++----- libs/envector/README.md | 6 +-- libs/envector/examples/basic_usage.py | 2 +- libs/envector/examples/ingest_synthetic_1k.py | 2 +- libs/envector/langchain_envector/__init__.py | 2 +- libs/envector/langchain_envector/client.py | 32 ++++++------ libs/envector/langchain_envector/types.py | 8 +-- .../langchain_envector/vectorstore.py | 12 ++--- pyproject.toml | 8 +-- pytest.ini | 2 +- tests/integration/test_es2_integration.py | 52 +++++++++---------- tests/requirements.txt | 4 +- 15 files changed, 85 insertions(+), 87 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 24705f5..78e880d 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -192,7 +192,7 @@ jobs: MAX_ATTEMPTS=30 SLEEP_SECONDS=10 - echo "Attempting to install es2==${WHEEL_VERSION} from TestPyPI..." + echo "Attempting to install pyenvector==${WHEEL_VERSION} from TestPyPI..." ATTEMPTS=0 while true; do ATTEMPTS=$((ATTEMPTS + 1)) diff --git a/.gitignore b/.gitignore index eca98cc..cfda40f 100644 --- a/.gitignore +++ b/.gitignore @@ -39,10 +39,8 @@ keys/ VECTORSTORE.md # External symlinks (local workspace references) -es2-msa es2-msa/ -es2-deploy -es2-deploy/ +envector-deployment/ # Local helper scripts run_unit_tests.py diff --git a/CONTRIBUTE.md b/CONTRIBUTE.md index a2e8f13..6007c29 100644 --- a/CONTRIBUTE.md +++ b/CONTRIBUTE.md @@ -13,16 +13,16 @@ Thanks for your interest in improving the project! This guide covers local setup ## Testing - **Unit tests** (fakes only): `python run_unit_tests.py` -- **Integration tests** (requires ES2 server + keys): - - Export `ES2_ADDRESS`, `ES2_KEY_PATH`, `ES2_KEY_ID` - - Optional: `ES2_USE_EMBEDDINGS=1`, `ES2_EMB_MODEL`, `ES2_USE_HF_DATASET=1` +- **Integration tests** (requires EnVector server + keys): + - Export `ENVECTOR_ADDRESS`, `ENVECTOR_KEY_PATH`, `ENVECTOR_KEY_ID` + - Optional: `ENVECTOR_USE_EMBEDDINGS=1`, `ENVECTOR_EMB_MODEL`, `ENVECTOR_USE_HF_DATASET=1` - Run `pytest -m integration -s` Please run relevant tests before submitting a PR and mention coverage in the description. ## Development Guidelines - Keep code, comments, and docs in English. -- Prefer the high-level `es2` SDK APIs; avoid direct gRPC/indexer calls unless required. +- Prefer the high-level `pyenvector` SDK APIs; avoid direct gRPC/indexer calls unless required. - Keep changes focused and documented; update README or notebooks when behavior changes. - Follow existing formatting and type-hint conventions. diff --git a/README.md b/README.md index ac6fea8..393e18d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # LangChain Envector Integration -Encrypted vector search for LangChain using Envector (ES2), powered by homomorphic encryption (CKKS). This repo ships a LangChain-compatible VectorStore and retriever utilities built on the high-level `es2` Python SDK. +Encrypted vector search for LangChain using Envector, powered by homomorphic encryption (CKKS). This repo ships a LangChain-compatible VectorStore and retriever utilities built on the high-level `pyenvector` Python SDK. ## Features - LangChain `VectorStore` interface with `similarity_search`, `from_texts`, etc. @@ -13,10 +13,10 @@ Encrypted vector search for LangChain using Envector (ES2), powered by homomorph - `python3.11 -m venv .venv && source .venv/bin/activate` - Install runtime dependencies: - `pip install -U pip setuptools wheel` - - `pip install es2 langchain sentence-transformers` + - `pip install pyenvector langchain sentence-transformers` ## Usage Overview -1. Configure Envector using `EnvectorConfig`, pointing to your ES2 endpoint and keys. +1. Configure Envector using `EnvectorConfig`, pointing to your EnVector endpoint and keys. 2. Initialize embeddings (or provide pre-computed vectors). 3. Instantiate `Envector(config=cfg, embeddings=emb)` and call `add_texts`, `add_documents`, or use `as_retriever`. 4. Run `similarity_search` or plug the retriever into your LangChain pipeline. @@ -25,13 +25,13 @@ Encrypted vector search for LangChain using Envector (ES2), powered by homomorph ## Configuration Key dataclasses live in `libs/envector/config.py`: -- `ConnectionConfig`: address or host/port for ES2. +- `ConnectionConfig`: address or host/port for EnVector. - `KeyConfig`: key path, key ID, optional preset/eval mode. - `IndexSettings`: index name, dimension (32–4096), query encryption mode, optional output fields and fetch parameters. - `EnvectorConfig`: wraps the above and enables auto-creation via `create_if_missing`. ## Data Model -- Each vector stores a single `metadata` string in ES2. +- Each vector stores a single `metadata` string in EnVector. - To align with LangChain’s `Document`, inserts wrap data as JSON: `{"text": ..., "metadata": ...}`. - Retrieval unwraps JSON, returning `Document(page_content=text, metadata={...})`. - Client-side filtering requires the JSON envelope to include an object under `metadata`. @@ -48,12 +48,12 @@ Key dataclasses live in `libs/envector/config.py`: cfg = EnvectorConfig( connection=ConnectionConfig( - address=ES2_ADDRESS, - access_token=ES2_ACCESS_TOKEN + address=ENVECTOR_ADDRESS, + access_token=ENVECTOR_ACCESS_TOKEN ), key=KeyConfig( - key_path=ES2_KEY_PATH, - key_id=ES2_KEY_ID, + key_path=ENVECTOR_KEY_PATH, + key_id=ENVECTOR_KEY_ID, preset="ip", eval_mode="rmp" ), @@ -100,18 +100,18 @@ Key dataclasses live in `libs/envector/config.py`: The methods `similarity_search` and `similarity_search_with_vector` (with `embeddings.embed_query()`) are also available to perform vector search. ## Troubleshooting -- Connection issues: verify ES2 address and registered keys. +- Connection issues: verify EnVector address and registered keys. - Embeddings mismatch: ensure embedding dimension equals `index.dim` when supplying vectors. - Unexpected raw strings: confirm inserts used the JSON envelope. - Key Issues: check key's metadata to sync with the registered key if facing any key issue. -## Testing Without ES2 -- Run unit tests offline (no ES2 or SDK required): +## Testing Without EnVector +- Run unit tests offline (no EnVector or SDK required): - `python -m pytest -q -m "not integration"` - or `python scripts/run_unit_tests.py` - Run integration tests (requires server and keys): - - Export `ES2_ADDRESS`, `ES2_KEY_PATH`, `ES2_KEY_ID` - - Optional: `ES2_USE_EMBEDDINGS=1`, `ES2_EMB_MODEL`, `ES2_USE_HF_DATASET=1` + - Export `ENVECTOR_ADDRESS`, `ENVECTOR_KEY_PATH`, `ENVECTOR_KEY_ID` + - Optional: `ENVECTOR_USE_EMBEDDINGS=1`, `ENVECTOR_EMB_MODEL`, `ENVECTOR_USE_HF_DATASET=1` - `python -m pytest -q -m integration -s` ## Contributing diff --git a/libs/envector/README.md b/libs/envector/README.md index 0ff389d..a9323f2 100644 --- a/libs/envector/README.md +++ b/libs/envector/README.md @@ -1,15 +1,15 @@ # Envector (LangChain VectorStore) -High-level VectorStore adaptor for Envector (ES2), using the `es2` SDK. Vectors are always encrypted on the server; the SDK performs required crypto client-side. +High-level VectorStore adaptor for Envector, using the `pyenvector` SDK. Vectors are always encrypted on the server; the SDK performs required crypto client-side. Key points -- Use high-level `es2.ES2` and `es2.Index`; avoid low-level `es2.api.Indexer`/gRPC. +- Use high-level `pyenvector.EnvectorClient` and `pyenvector.Index`; avoid low-level `pyenvector.api.Indexer`/gRPC. - Index encryption is fixed to `cipher`. Query can be `plain` or `cipher`. - Metadata is stored as a single JSON string per item: `{id, text, metadata}`. Files - `config.py`: Configuration dataclasses (connection, key, index). -- `client.py`: Initializes ES2 + index and returns an `Index` instance. +- `client.py`: Initializes EnVector + index and returns an `Index` instance. - `vectorstore.py`: `Envector` VectorStore implementation. - `retriever.py`: Optional wrapper retriever. - `examples/`: Minimal examples. diff --git a/libs/envector/examples/basic_usage.py b/libs/envector/examples/basic_usage.py index d70b61b..88eeb47 100644 --- a/libs/envector/examples/basic_usage.py +++ b/libs/envector/examples/basic_usage.py @@ -1,7 +1,7 @@ """Basic usage example for Envector VectorStore. Requirements: -- `es2` +- `pyenvector` - `langchain` (version providing VectorStore APIs) - An embeddings backend, e.g. sentence-transformers """ diff --git a/libs/envector/examples/ingest_synthetic_1k.py b/libs/envector/examples/ingest_synthetic_1k.py index 8b055c8..ec92a7d 100644 --- a/libs/envector/examples/ingest_synthetic_1k.py +++ b/libs/envector/examples/ingest_synthetic_1k.py @@ -1,7 +1,7 @@ """Ingest the synthetic 1K dataset into Envector. Requires: -- ES2 server and keys. +- EnVector server and keys. - Dataset at `data/synthetic_rag_1k.jsonl` (run scripts/make_synthetic_rag_dataset.py). Usage: diff --git a/libs/envector/langchain_envector/__init__.py b/libs/envector/langchain_envector/__init__.py index cf7a9b6..0567c2e 100644 --- a/libs/envector/langchain_envector/__init__.py +++ b/libs/envector/langchain_envector/__init__.py @@ -1,6 +1,6 @@ """Envector LangChain integration package. -Provides a LangChain-compatible VectorStore that wraps the high-level `es2` SDK. +Provides a LangChain-compatible VectorStore that wraps the high-level `pyenvector` SDK. All code and comments are in English as per project rules. """ diff --git a/libs/envector/langchain_envector/client.py b/libs/envector/langchain_envector/client.py index c3ac2f9..930913d 100644 --- a/libs/envector/langchain_envector/client.py +++ b/libs/envector/langchain_envector/client.py @@ -4,45 +4,45 @@ class EnvectorClient: - """Thin convenience client around the high-level `es2` SDK. + """Thin convenience client around the high-level `pyenvector` SDK. - Establishes a connection - Initializes key and index configuration - Optionally creates the index if missing - - Provides access to the ES2 `Index` instance + - Provides access to the envector `Index` instance """ def __init__(self, config: EnvectorConfig): self.config = config - self._es2 = None + self._ev = None self._index = None def init(self): - import es2 + import pyenvector as ev c = self.config.connection k = self.config.key i = self.config.index - es2_client = es2.ES2() + ev_client = ev.EnvectorClient() # Connection if c.address: - es2_client.init_connect(address=c.address, access_token=c.access_token) + ev_client.init_connect(address=c.address, access_token=c.access_token) else: if not (c.host and c.port): raise ValueError("Either address or host+port must be provided.") - es2_client.init_connect( + ev_client.init_connect( host=c.host, port=c.port, access_token=c.access_token ) # Key path baseline for Index - from es2.index import Index as _Index + from pyenvector.index import Index as _Index _Index.init_key_path(k.key_path) # Index config + key setup - es2_client.init_index_config( + ev_client.init_index_config( index_name=i.index_name, dim=i.dim, key_path=k.key_path, @@ -59,13 +59,13 @@ def init(self): # Create index if missing if self.config.create_if_missing: - idx_list = es2_client.get_index_list() + idx_list = ev_client.get_index_list() if i.index_name not in idx_list: - es2_client.create_index(index_name=i.index_name, dim=i.dim) + ev_client.create_index(index_name=i.index_name, dim=i.dim) # Bind index instance - self._index = es2.Index(i.index_name) - self._es2 = es2_client + self._index = ev.Index(i.index_name) + self._ev = ev_client return self @property @@ -75,7 +75,7 @@ def index(self): return self._index @property - def es2(self): - if self._es2 is None: + def ev(self): + if self._ev is None: raise RuntimeError("Client not initialized. Call init().") - return self._es2 + return self._ev diff --git a/libs/envector/langchain_envector/types.py b/libs/envector/langchain_envector/types.py index 286a24c..99d4a73 100644 --- a/libs/envector/langchain_envector/types.py +++ b/libs/envector/langchain_envector/types.py @@ -29,9 +29,9 @@ class SearchResult: def pack_metadata(text: str, metadata: Optional[Dict[str, Any]] = None) -> str: - """Pack text and metadata into a single JSON string field accepted by ES2. + """Pack text and metadata into a single JSON string field accepted by pyenvector. - ES2 metadata API stores lists of strings; we store a single JSON blob per item. + pyenvector metadata API stores lists of strings; we store a single JSON blob per item. Item-level IDs are not persisted/addressable. """ import json @@ -46,7 +46,7 @@ def pack_metadata(text: str, metadata: Optional[Dict[str, Any]] = None) -> str: def unpack_metadata(raw: Any) -> Dict[str, Any]: """Return metadata as a dict regardless of the raw payload type. - Recent ES2 versions may return decrypted metadata as a Python dict instead + Recent pyenvector versions may return decrypted metadata as a Python dict instead of the JSON string we originally stored. We normalise the payload here so downstream code always works with a dictionary. """ @@ -79,7 +79,7 @@ def unpack_metadata(raw: Any) -> Dict[str, Any]: if isinstance(data, dict): return data except Exception: - # Some ES2 responses return Python-literal strings (single quotes). + # Some pyenvector responses return Python-literal strings (single quotes). try: import ast diff --git a/libs/envector/langchain_envector/vectorstore.py b/libs/envector/langchain_envector/vectorstore.py index eef1cb5..67bccbf 100644 --- a/libs/envector/langchain_envector/vectorstore.py +++ b/libs/envector/langchain_envector/vectorstore.py @@ -38,10 +38,10 @@ def __init__( class Envector(VectorStore): # type: ignore[misc] - """LangChain-compatible VectorStore adaptor for Envector (ES2). + """LangChain-compatible VectorStore adaptor for Envector. - This class wraps the high-level `es2` SDK. It does not use low-level - gRPC stubs or `es2.api.Indexer` directly. + This class wraps the high-level `pyenvector` SDK. It does not use low-level + gRPC stubs or `pyenvector.api.Indexer` directly. """ def __init__( @@ -89,7 +89,7 @@ def add_texts( # Prepare metadata JSON strings per item packed = [pack_metadata(t, m) for t, m in zip(texts, metadatas)] - # Insert using high-level ES2 Index + # Insert using high-level pyenvector Index result_ids = self.client.index.insert(data=vectors, metadata=packed) # Return ephemeral placeholders to satisfy VectorStore interface, @@ -111,7 +111,7 @@ def _similarity_search_with_scores( results = self.client.index.search( query=embedding, top_k=top_k, output_fields=self.config.index.output_fields ) - # ES2 Index.search returns a list for each query; we passed single query + # pyenvector Index.search returns a list for each query; we passed single query result = ( results[0] if isinstance(results, list) and results and isinstance(results[0], list) @@ -265,7 +265,7 @@ def add_documents( extracting `page_content` and `metadata` from each Document. Notes: - - Manual `ids` are ignored (ES2 does not support user-provided IDs). + - Manual `ids` are ignored (EnVector does not support user-provided IDs). - When `embeddings` is not configured, you must supply `vectors`. - Returns ephemeral IDs as produced by the client insert. """ diff --git a/pyproject.toml b/pyproject.toml index 2e21520..acb83b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,8 +7,8 @@ build-backend = "setuptools.build_meta" [project] name = "langchain-envector" -version = "0.1.2" -description = "LangChain VectorStore integration for Envector (ES2) encrypted vector search" +version = "0.1.3" +description = "LangChain VectorStore integration for Envector" readme = "README.md" license = {text = "MIT"} requires-python = ">=3.9,<3.14" @@ -16,10 +16,10 @@ authors = [ { name = "Envector Contributors" } ] dependencies = [ - "es2", + "pyenvector", "langchain>=0.2.0", ] -keywords = ["langchain", "vectorstore", "homomorphic-encryption", "ckks", "encrypted-search", "envector", "es2"] +keywords = ["langchain", "vectorstore", "homomorphic-encryption", "ckks", "encrypted-search", "envector", "pyenvector"] classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", diff --git a/pytest.ini b/pytest.ini index 6d12c43..70b79b9 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,6 +1,6 @@ [pytest] markers = - integration: tests that require a running ES2 server and the real es2 SDK + integration: tests that require a running EnVector server and the real EnVector SDK testpaths = tests diff --git a/tests/integration/test_es2_integration.py b/tests/integration/test_es2_integration.py index 8f5fca1..8a62b30 100644 --- a/tests/integration/test_es2_integration.py +++ b/tests/integration/test_es2_integration.py @@ -25,35 +25,35 @@ def _require_env(name: str) -> str: @pytest.mark.skipif( - os.environ.get("ES2_ADDRESS") is None, - reason="Set ES2_ADDRESS (e.g., 0.0.0.0:50050) to enable ES2 integration tests", + os.environ.get("ENVECTOR_ADDRESS") is None, + reason="Set ENVECTOR_ADDRESS (e.g., 0.0.0.0:50050) to enable Envector integration tests", ) def test_e2e_vectorstore_plain_and_cipher(): try: - import es2 # type: ignore + import pyenvector # type: ignore except Exception as e: # pragma: no cover - env-dependent - pytest.skip(f"es2 SDK not available: {e}") + pytest.skip(f"pyenvector SDK not available: {e}") - address = _require_env("ES2_ADDRESS") - key_path = _require_env("ES2_KEY_PATH") - key_id = _require_env("ES2_KEY_ID") - use_emb = os.environ.get("ES2_USE_EMBEDDINGS") in {"1", "true", "TRUE", "yes"} + address = _require_env("ENVECTOR_ADDRESS") + key_path = _require_env("ENVECTOR_KEY_PATH") + key_id = _require_env("ENVECTOR_KEY_ID") + use_emb = os.environ.get("ENVECTOR_USE_EMBEDDINGS") in {"1", "true", "TRUE", "yes"} model_name = os.environ.get( - "ES2_EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2" + "ENVECTOR_EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2" ) - use_hf = os.environ.get("ES2_USE_HF_DATASET") in {"1", "true", "TRUE", "yes"} - hf_name = os.environ.get("ES2_HF_NAME", "ag_news") - hf_subset = os.environ.get("ES2_HF_SUBSET") - hf_split = os.environ.get("ES2_HF_SPLIT", "train") - hf_text_col = os.environ.get("ES2_HF_TEXT_COL", "text") + use_hf = os.environ.get("ENVECTOR_USE_HF_DATASET") in {"1", "true", "TRUE", "yes"} + hf_name = os.environ.get("ENVECTOR_HF_NAME", "ag_news") + hf_subset = os.environ.get("ENVECTOR_HF_SUBSET") + hf_split = os.environ.get("ENVECTOR_HF_SPLIT", "train") + hf_text_col = os.environ.get("ENVECTOR_HF_TEXT_COL", "text") hf_meta_cols = [ - c for c in os.environ.get("ES2_HF_META_COLS", "label").split(",") if c + c for c in os.environ.get("ENVECTOR_HF_META_COLS", "label").split(",") if c ] - hf_size = int(os.environ.get("ES2_HF_SIZE", "200")) - hf_seed = int(os.environ.get("ES2_HF_SEED", "42")) + hf_size = int(os.environ.get("ENVECTOR_HF_SIZE", "200")) + hf_seed = int(os.environ.get("ENVECTOR_HF_SEED", "42")) # Determine dimension: either from env, or from embeddings model, or default - dim_env = os.environ.get("ES2_DIM") + dim_env = os.environ.get("ENVECTOR_DIM") if use_emb: emb = None # Prefer LangChain embeddings if available, else fall back to sentence-transformers @@ -79,13 +79,13 @@ def test_e2e_vectorstore_plain_and_cipher(): pytest.skip("Envector supports dimensions in [32, 4096]") base_index_name = os.environ.get( - "ES2_INDEX_NAME", f"inttest_{secrets.token_hex(4)}" + "ENVECTOR_INDEX_NAME", f"inttest_{secrets.token_hex(4)}" ) - import es2 + import pyenvector as ev - es2.init_connect(address=address) - es2.reset() + ev.init_connect(address=address) + ev.reset() # Plain query mode cfg_plain = EnvectorConfig( @@ -205,8 +205,8 @@ def test_e2e_vectorstore_plain_and_cipher(): assert all("_id" in d.metadata for d in docs_cc) # Cleanup - store_plain.client.es2.init_connect(address=address) - store_plain.client.es2.drop_index(cfg_plain.index.index_name) + store_plain.client.ev.init_connect(address=address) + store_plain.client.ev.drop_index(cfg_plain.index.index_name) - store_cc.client.es2.init_connect(address=address) - store_cc.client.es2.drop_index(cfg_cc.index.index_name) + store_cc.client.ev.init_connect(address=address) + store_cc.client.ev.drop_index(cfg_cc.index.index_name) diff --git a/tests/requirements.txt b/tests/requirements.txt index d51cc17..02b0ade 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -3,9 +3,9 @@ # Test runner pytest -# ES2 SDK (encrypted vector search) — install from local wheel at repo root +# pyenvector SDK — install from local wheel at repo root # Use a direct wheel path (no PEP 508 direct reference) for maximum pip compatibility. -./es2-1.0.3rc7-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +./pyenvector-1.2.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl # Note: LangChain is optional for tests. Integration tests will fall back to # sentence-transformers if LangChain embeddings are unavailable. From 21f0607921c2c882834b0e3944a9a5deb73f1b76 Mon Sep 17 00:00:00 2001 From: suyeong Date: Mon, 8 Dec 2025 01:00:02 +0000 Subject: [PATCH 13/13] precommit: rm try-catch import sdk --- tests/integration/test_es2_integration.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/integration/test_es2_integration.py b/tests/integration/test_es2_integration.py index 8a62b30..a7272fd 100644 --- a/tests/integration/test_es2_integration.py +++ b/tests/integration/test_es2_integration.py @@ -29,11 +29,6 @@ def _require_env(name: str) -> str: reason="Set ENVECTOR_ADDRESS (e.g., 0.0.0.0:50050) to enable Envector integration tests", ) def test_e2e_vectorstore_plain_and_cipher(): - try: - import pyenvector # type: ignore - except Exception as e: # pragma: no cover - env-dependent - pytest.skip(f"pyenvector SDK not available: {e}") - address = _require_env("ENVECTOR_ADDRESS") key_path = _require_env("ENVECTOR_KEY_PATH") key_id = _require_env("ENVECTOR_KEY_ID")