diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 7d4ab9b..c3dc624 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -27,6 +27,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel python -m pip install -e . python -m pip install pytest pre-commit + python -m pip install langchain-tests - name: Lint and format run: pre-commit run --all-files --show-diff-on-failure diff --git a/README.md b/README.md index 393e18d..44ae6fb 100644 --- a/README.md +++ b/README.md @@ -42,8 +42,9 @@ Key dataclasses live in `libs/envector/config.py`: - Filtering happens client-side; ensure metadata is JSON for structured filters. ## Examples -- Configuration - ```python +### Configuration + +```python from langchain_envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig cfg = EnvectorConfig( @@ -66,38 +67,63 @@ Key dataclasses live in `libs/envector/config.py`: ) ``` -- Add documents (from LangChain Documents): - - ```python - from langchain_core.documents import Document - from langchain_envector.vectorstore import Envector - - docs = [ - Document( - page_content="chunk-1", - metadata={"source": "paper.pdf", "page": 1, "chunk": 0} - ), - Document( - page_content="chunk-2", - metadata={"source": "paper.pdf", "page": 1, "chunk": 1} - ), - ] - - store = Envector(config=cfg, embeddings=emb) - store.add_documents(docs) - ``` +### Add documents (from LangChain Documents): - The method `add_texts` is also available to store texts. 
+```python +from langchain_core.documents import Document +from langchain_envector.vectorstore import Envector -- Similarity search +docs = [ + Document( + page_content="chunk-1", + metadata={"source": "paper.pdf", "page": 1, "chunk": 0} + ), + Document( + page_content="chunk-2", + metadata={"source": "paper.pdf", "page": 1, "chunk": 1} + ), +] - ```python - results = store.similarity_search_with_score(query, k=3) - for doc, score in results: - print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]") - ``` +store = Envector(config=cfg, embeddings=emb) +store.add_documents(docs) +``` + +Or you can use `add_texts` to store vectors and their texts. + +```python +store.add_texts( + texts=["chunk 3"], + metadatas=[{"source": "paper.pdf", "page": 1, "chunk": 2}] +) +``` + +### Similarity search + +```python +results = store.similarity_search(query, k=1) +for doc in results: + print(f"* {doc.page_content} [{doc.metadata}]") +``` + +#### Similarity Search with Score + +```python +results = store.similarity_search_with_score(query, k=1) +for doc, score in results: + print(f"* [SIM={score:.3f}] {doc.page_content} [{doc.metadata}]") +``` + + +#### Similarity Search with Vector + +```python +query_embedding = embeddings.embed_query(query) +print(f"Query: {query_embedding[:3]}") +results = store.similarity_search_by_vector(query_embedding, k=3) +for doc in results: + print(f"* {doc.page_content} [{doc.metadata}]") +``` - The methods `similarity_search` and `similarity_search_with_vector` (with `embeddings.embed_query()`) are also available to perform vector search. ## Troubleshooting - Connection issues: verify EnVector address and registered keys. @@ -105,14 +131,45 @@ Key dataclasses live in `libs/envector/config.py`: - Unexpected raw strings: confirm inserts used the JSON envelope. - Key Issues: check key's metadata to sync with the registered key if facing any key issue. 
-## Testing Without EnVector -- Run unit tests offline (no EnVector or SDK required): - - `python -m pytest -q -m "not integration"` - - or `python scripts/run_unit_tests.py` -- Run integration tests (requires server and keys): - - Export `ENVECTOR_ADDRESS`, `ENVECTOR_KEY_PATH`, `ENVECTOR_KEY_ID` - - Optional: `ENVECTOR_USE_EMBEDDINGS=1`, `ENVECTOR_EMB_MODEL`, `ENVECTOR_USE_HF_DATASET=1` - - `python -m pytest -q -m integration -s` +## Test + +Before running tests, install dependencies for pytest: + +```bash +pip install -r tests/requirements.txt +``` + +### Unit Test + +Run unit tests offline (no EnVector or SDK required) + +```bash +python -m pytest -q -m "not integration" +# or +python scripts/run_unit_tests.py +``` + +### Integration Test + +Run integration tests (requires enVector server) + +1. Prepare the running enVector server + +2. Export the environment variables: + + - `ENVECTOR_ADDRESS` + - `ENVECTOR_KEY_PATH` + - `ENVECTOR_KEY_ID` + - `ENVECTOR_INDEX_NAME` + - (Optional) `ENVECTOR_USE_EMBEDDINGS=1` + - (Optional) `ENVECTOR_EMB_MODEL` + - (Optional) `ENVECTOR_USE_HF_DATASET=1` + +3. Run the following command: + +```bash +python -m pytest -q -m integration -s +``` ## Contributing See [`CONTRIBUTE.md`](CONTRIBUTE.md) for development, testing, and PR guidelines. diff --git a/libs/envector/langchain_envector/vectorstore.py b/libs/envector/langchain_envector/vectorstore.py index 67bccbf..e41367f 100644 --- a/libs/envector/langchain_envector/vectorstore.py +++ b/libs/envector/langchain_envector/vectorstore.py @@ -118,12 +118,18 @@ def _similarity_search_with_scores( else results ) + if not result: + return [] + docs_with_scores: List[Tuple[Document, float]] = [] # Iterate from top-1 to top-k for item in result: # item = {"id": ..., "score": float, "metadata": [str] or {...}} score = float(item.get("score", 0.0)) md_obj_raw = item.get("metadata") + if md_obj_raw in (None, "", [], {}): + # Skip placeholder/empty hits returned by the backend. 
+ continue # Metadata encryption/decryption is handled by the SDK. # Envector currently supports a single associated data field (string). @@ -133,6 +139,9 @@ def _similarity_search_with_scores( text = md_obj.get("text", "") if "_raw" not in md_obj else md_obj["_raw"] metadata = md_obj.get("metadata", {}) if "_raw" not in md_obj else {} + if not text and not metadata: + # Treat empty text+metadata as no result. + continue # client-side filter if filter: @@ -143,9 +152,11 @@ def _similarity_search_with_scores( if score_threshold is not None and score < score_threshold: continue + doc_id = item.get("id") doc = Document( page_content=text, metadata={**metadata, "_score": score, "_id": item.get("id")}, + id=doc_id, ) docs_with_scores.append((doc, score)) @@ -181,7 +192,16 @@ def similarity_search( fetch_k=fetch_k, **kwargs, ) - return [doc for doc, _ in docs_with_scores] + return [ + Document( + page_content=doc.page_content, + metadata={ + k: v for k, v in doc.metadata.items() if k not in ("_score", "_id") + }, + id=getattr(doc, "id", None), + ) + for doc, _ in docs_with_scores + ] def similarity_search_with_score( self, diff --git a/pytest.ini b/pytest.ini index 70b79b9..3500c10 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,6 +1,6 @@ [pytest] markers = integration: tests that require a running EnVector server and the real EnVector SDK +asyncio_mode = auto testpaths = tests - diff --git a/tests/integration/test_es2_integration.py b/tests/integration_tests/test_e2e.py similarity index 95% rename from tests/integration/test_es2_integration.py rename to tests/integration_tests/test_e2e.py index a7272fd..d400d01 100644 --- a/tests/integration/test_es2_integration.py +++ b/tests/integration_tests/test_e2e.py @@ -136,7 +136,7 @@ def test_e2e_vectorstore_plain_and_cipher(): (d.page_content[:80] + ("..." 
if len(d.page_content) > 80 else "")), ) assert len(docs) >= 1 - assert all("_id" in d.metadata for d in docs) + assert all(getattr(d, "id", None) or "_id" in d.metadata for d in docs) # optional filter check if 'label' is part of meta if not use_hf: docs_f = store_plain.similarity_search( @@ -153,7 +153,7 @@ def test_e2e_vectorstore_plain_and_cipher(): "[plain] results (explicit embedding e1):", [d.page_content for d in docs] ) assert any(d.page_content == texts[0] for d in docs) - assert all("_id" in d.metadata for d in docs) + assert all(getattr(d, "id", None) or "_id" in d.metadata for d in docs) docs_f = store_plain.similarity_search( "q", k=2, embedding=e2, filter={"label": "B"} ) @@ -189,7 +189,7 @@ def test_e2e_vectorstore_plain_and_cipher(): (d.page_content[:80] + ("..." if len(d.page_content) > 80 else "")), ) assert len(docs_cc) >= 1 - assert all("_id" in d.metadata for d in docs_cc) + assert all(getattr(d, "id", None) or "_id" in d.metadata for d in docs_cc) else: docs_cc = store_cc.similarity_search("q", k=2, embedding=e2) print( @@ -197,7 +197,7 @@ def test_e2e_vectorstore_plain_and_cipher(): [d.page_content for d in docs_cc], ) assert any(d.page_content == texts[1] for d in docs_cc) - assert all("_id" in d.metadata for d in docs_cc) + assert all(getattr(d, "id", None) or "_id" in d.metadata for d in docs_cc) # Cleanup store_plain.client.ev.init_connect(address=address) diff --git a/tests/integration_tests/test_vectorstore.py b/tests/integration_tests/test_vectorstore.py new file mode 100644 index 0000000..d0b8113 --- /dev/null +++ b/tests/integration_tests/test_vectorstore.py @@ -0,0 +1,143 @@ +from __future__ import annotations + +import os +import secrets +from typing import Generator + +import pytest +from langchain_core.embeddings import DeterministicFakeEmbedding +from langchain_core.vectorstores import VectorStore + +from langchain_tests.integration_tests import VectorStoreIntegrationTests + +from langchain_envector.config import ( + 
ConnectionConfig, + EnvectorConfig, + IndexSettings, + KeyConfig, +) +from langchain_envector.vectorstore import Envector + +pytestmark = pytest.mark.integration + + +def _require_env(name: str) -> str: + value = os.environ.get(name) + if not value: + pytest.skip(f"Set {name} to enable integration test") + return value + + +class TestEnvectorVectorStore(VectorStoreIntegrationTests): + # VectorStoreIntegrationTests provides the standard search/add/get scenarios; + # this class only wires up the Envector fixture and capability flags. + @staticmethod + def get_embeddings() -> DeterministicFakeEmbedding: + # Envector requires dimension in [32, 4096]. + return DeterministicFakeEmbedding(size=32) + + @property + def has_async(self) -> bool: + # Envector does not yet support async methods. + return False + + @property + def has_get_by_ids(self) -> bool: + # Envector does not yet support get by IDs. + return False + + @pytest.fixture() + def vectorstore(self) -> Generator[VectorStore, None, None]: # type: ignore[override] + # Set up Envector vector store for testing. + address = _require_env("ENVECTOR_ADDRESS") + key_path = _require_env("ENVECTOR_KEY_PATH") + key_id = _require_env("ENVECTOR_KEY_ID") + index_name = f"lc_std_{secrets.token_hex(4)}" + + cfg = EnvectorConfig( + connection=ConnectionConfig(address=address), + key=KeyConfig(key_path=key_path, key_id=key_id), + index=IndexSettings( + index_name=index_name, dim=32, query_encryption="plain" + ), + create_if_missing=True, + ) + # Create the vector store. + store = Envector(config=cfg, embeddings=self.get_embeddings()) + + try: + yield store + finally: + try: + # Clean up: delete the created index. + store.client.ev.delete_index(index_name) + except Exception: + pass + + @pytest.mark.xfail( + reason="Envector does not support delete semantics for standard tests." 
+ ) + def test_deleting_documents(self, vectorstore: VectorStore) -> None: + pass + + @pytest.mark.xfail( + reason="Envector does not support delete semantics for standard tests." + ) + def test_deleting_bulk_documents(self, vectorstore: VectorStore) -> None: + pass + + @pytest.mark.xfail( + reason="Envector does not support delete semantics for standard tests." + ) + def test_delete_missing_content(self, vectorstore: VectorStore) -> None: + pass + + @pytest.mark.xfail(reason="Envector does not support update-by-id semantics yet.") + def test_add_documents_by_id_with_mutation(self, vectorstore: VectorStore) -> None: + pass + + @pytest.mark.xfail( + reason="Envector does not support idempotent add-by-id semantics yet." + ) + def test_add_documents_with_ids_is_idempotent( + self, vectorstore: VectorStore + ) -> None: + pass + + @pytest.mark.xfail( + reason="Empty index returns placeholder results in current backend." + ) + def test_vectorstore_is_empty(self, vectorstore: VectorStore) -> None: + pass + + @pytest.mark.xfail( + reason="Empty index returns placeholder results in current backend." + ) + def test_vectorstore_still_empty(self, vectorstore: VectorStore) -> None: + pass + + @pytest.mark.xfail( + reason="Envector does not support delete semantics for standard tests." + ) + async def test_deleting_documents_async(self, vectorstore: VectorStore) -> None: + pass + + @pytest.mark.xfail( + reason="Envector does not support delete semantics for standard tests." + ) + async def test_deleting_bulk_documents_async( + self, vectorstore: VectorStore + ) -> None: + pass + + @pytest.mark.xfail( + reason="Envector does not support delete semantics for standard tests." 
+ ) + async def test_delete_missing_content_async(self, vectorstore: VectorStore) -> None: + pass + + @pytest.mark.xfail(reason="Envector does not support update-by-id semantics yet.") + async def test_add_documents_by_id_with_mutation_async( + self, vectorstore: VectorStore + ) -> None: + pass diff --git a/tests/requirements.txt b/tests/requirements.txt index 02b0ade..e7448e6 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -2,10 +2,11 @@ # Test runner pytest +pytest-asyncio # pyenvector SDK — install from local wheel at repo root # Use a direct wheel path (no PEP 508 direct reference) for maximum pip compatibility. -./pyenvector-1.2.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +pyenvector # Note: LangChain is optional for tests. Integration tests will fall back to # sentence-transformers if LangChain embeddings are unavailable. @@ -19,4 +20,7 @@ sentence-transformers # PyTorch; required by sentence-transformers (CPU wheels will be resolved per platform) torch -langchain \ No newline at end of file +# LangChain core and test utilities (local installs from repo root) +langchain +langchain-core +langchain-tests diff --git a/tests/test_vectorstore.py b/tests/test_vectorstore.py index 8178c52..9657895 100644 --- a/tests/test_vectorstore.py +++ b/tests/test_vectorstore.py @@ -19,13 +19,15 @@ def _cfg() -> EnvectorConfig: ) -def test_add_texts_ignores_ids_and_returns_item_ids(): +def test_add_texts_returns_item_ids(): + # Test that add_texts returns the item IDs assigned by the vector store + # Note that user-provided IDs are ignored client = FakeClient() store = Envector(config=_cfg(), embeddings=FakeEmbeddings(dim=4), client=client) ret_ids = store.add_texts( ["t1", "t2"], metadatas=[{"m": 1}, {"m": 2}], ids=["a", "b"] - ) # ids ignored + ) # input ids ignored # Returned IDs assert len(ret_ids) == 2 @@ -63,7 +65,6 @@ def test_similarity_search_with_filter_and_threshold(): ) assert len(docs) == 1 assert docs[0].page_content == "A" - 
assert docs[0].metadata["_score"] >= 0.5 def test_similarity_search_handles_string_metadata(): @@ -106,9 +107,6 @@ def test_similarity_search_uses_raw_text_when_not_json(): assert len(docs) == 1 assert docs[0].page_content == "Plain text content without JSON" # user metadata should be empty dict when not provided - assert all( - k in docs[0].metadata for k in ["_score", "_id"] - ) # only system fields present def test_similarity_search_handles_python_literal_metadata(): @@ -183,7 +181,6 @@ def test_similarity_search_with_score_returns_tuples(): assert isinstance(first_doc, LC_Document) assert first_doc.page_content == "Doc0" assert first_doc.metadata["_score"] == first_score - assert first_doc.metadata["_id"] == "s-0" def test_similarity_search_with_score_by_vector_returns_tuples(): @@ -207,7 +204,6 @@ def test_similarity_search_with_score_by_vector_returns_tuples(): doc, score = results[0] assert doc.page_content == "VectorDoc" assert score == doc.metadata["_score"] - assert doc.metadata["_id"] == "sv-0" def test_from_texts_inserts_using_embeddings(): @@ -259,6 +255,22 @@ def test_add_documents_with_embeddings(): assert any('"text": "C2"' in m for m in packed) +def test_add_documents_returns_item_ids(): + # Test that add_documents returns the item IDs assigned by the vector store + # Note that user-provided IDs are ignored + client = FakeClient() + store = Envector(config=_cfg(), embeddings=FakeEmbeddings(dim=4), client=client) + + docs = [ + LC_Document(page_content="D1", metadata={"t": 1}), + LC_Document(page_content="D2", metadata={"t": 2}), + ] + ret_ids = store.add_documents(docs, ids=["user-1", "user-2"]) + + assert len(ret_ids) == 2 + assert ret_ids == [2, 3] + + def test_add_documents_requires_vectors_when_no_embeddings(): client = FakeClient() store = Envector(config=_cfg(), embeddings=None, client=client)