Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ jobs:
python -m pip install --upgrade pip setuptools wheel
python -m pip install -e .
python -m pip install pytest pre-commit
python -m pip install langchain-tests

- name: Lint and format
run: pre-commit run --all-files --show-diff-on-failure
Expand Down
133 changes: 95 additions & 38 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ Key dataclasses live in `libs/envector/config.py`:
- Filtering happens client-side; ensure metadata is JSON for structured filters.

## Examples
- Configuration
```python
### Configuration

```python
from langchain_envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig

cfg = EnvectorConfig(
Expand All @@ -66,53 +67,109 @@ Key dataclasses live in `libs/envector/config.py`:
)
```

- Add documents (from LangChain Documents):

```python
from langchain_core.documents import Document
from langchain_envector.vectorstore import Envector

docs = [
Document(
page_content="chunk-1",
metadata={"source": "paper.pdf", "page": 1, "chunk": 0}
),
Document(
page_content="chunk-2",
metadata={"source": "paper.pdf", "page": 1, "chunk": 1}
),
]

store = Envector(config=cfg, embeddings=emb)
store.add_documents(docs)
```
### Add documents (from LangChain Documents):

The method `add_texts` is also available to store texts.
```python
from langchain_core.documents import Document
from langchain_envector.vectorstore import Envector

- Similarity search
docs = [
Document(
page_content="chunk-1",
metadata={"source": "paper.pdf", "page": 1, "chunk": 0}
),
Document(
page_content="chunk-2",
metadata={"source": "paper.pdf", "page": 1, "chunk": 1}
),
]

```python
results = store.similarity_search_with_score(query, k=3)
for doc, score in results:
print(f"* [SIM={score:.3f}] {doc.page_content} [{doc.metadata}]")
```
store = Envector(config=cfg, embeddings=emb)
store.add_documents(docs)
```

Or you can use `add_texts` to store vectors and their texts.

```python
store.add_texts(
texts=["chunk 3"],
metadatas=[{"source": "paper.pdf", "page": 1, "chunk": 2}]
)
```

### Similarity search

```python
results = store.similarity_search(query, k=1)
for doc in results:
print(f"* {doc.page_content} [{doc.metadata}]")
```

#### Similarity Search with Score

```python
results = store.similarity_search_with_score(query, k=1)
for doc, score in results:
print(f"* [SIM={score:.3f}] {doc.page_content} [{doc.metadata}]")
```


#### Similarity Search with Vector

```python
query_embedding = embeddings.embed_query(query)
print(f"Query: {query_embedding[:3]}")
results = store.similarity_search_by_vector(query_embedding, k=3)
for doc in results:
print(f"* {doc.page_content} [{doc.metadata}]")
```

The methods `similarity_search` and `similarity_search_by_vector` (with `embeddings.embed_query()`) are also available to perform vector search.

## Troubleshooting
- Connection issues: verify EnVector address and registered keys.
- Embeddings mismatch: ensure embedding dimension equals `index.dim` when supplying vectors.
- Unexpected raw strings: confirm inserts used the JSON envelope.
- Key issues: if you run into key-related errors, check that the local key's metadata matches the key registered on the server.

## Testing Without EnVector
- Run unit tests offline (no EnVector or SDK required):
- `python -m pytest -q -m "not integration"`
- or `python scripts/run_unit_tests.py`
- Run integration tests (requires server and keys):
- Export `ENVECTOR_ADDRESS`, `ENVECTOR_KEY_PATH`, `ENVECTOR_KEY_ID`
- Optional: `ENVECTOR_USE_EMBEDDINGS=1`, `ENVECTOR_EMB_MODEL`, `ENVECTOR_USE_HF_DATASET=1`
- `python -m pytest -q -m integration -s`
## Test

Before running tests, install dependencies for pytest:

```bash
pip install -r tests/requirements.txt
```

### Unit Test

Run unit tests offline (no EnVector or SDK required)

```bash
python -m pytest -q -m "not integration"
# or
python scripts/run_unit_tests.py
```

### Integration Test

Run integration tests (requires enVector server)

1. Make sure an enVector server is running and reachable

2. Export the environment variables:

- `ENVECTOR_ADDRESS`
- `ENVECTOR_KEY_PATH`
- `ENVECTOR_KEY_ID`
- `ENVECTOR_INDEX_NAME`
- (Optional) `ENVECTOR_USE_EMBEDDINGS=1`
- (Optional) `ENVECTOR_EMB_MODEL`
- (Optional) `ENVECTOR_USE_HF_DATASET=1`

3. Run the following command:

```bash
python -m pytest -q -m integration -s
```

## Contributing
See [`CONTRIBUTE.md`](CONTRIBUTE.md) for development, testing, and PR guidelines.
22 changes: 21 additions & 1 deletion libs/envector/langchain_envector/vectorstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,18 @@ def _similarity_search_with_scores(
else results
)

if not result:
return []

docs_with_scores: List[Tuple[Document, float]] = []
# Iterate from top-1 to top-k
for item in result:
# item = {"id": ..., "score": float, "metadata": [str] or {...}}
score = float(item.get("score", 0.0))
md_obj_raw = item.get("metadata")
if md_obj_raw in (None, "", [], {}):
# Skip placeholder/empty hits returned by the backend.
continue

# Metadata encryption/decryption is handled by the SDK.
# Envector currently supports a single associated data field (string).
Expand All @@ -133,6 +139,9 @@ def _similarity_search_with_scores(

text = md_obj.get("text", "") if "_raw" not in md_obj else md_obj["_raw"]
metadata = md_obj.get("metadata", {}) if "_raw" not in md_obj else {}
if not text and not metadata:
# Treat empty text+metadata as no result.
continue

# client-side filter
if filter:
Expand All @@ -143,9 +152,11 @@ def _similarity_search_with_scores(
if score_threshold is not None and score < score_threshold:
continue

doc_id = item.get("id")
doc = Document(
page_content=text,
metadata={**metadata, "_score": score, "_id": item.get("id")},
id=doc_id,
)
docs_with_scores.append((doc, score))

Expand Down Expand Up @@ -181,7 +192,16 @@ def similarity_search(
fetch_k=fetch_k,
**kwargs,
)
return [doc for doc, _ in docs_with_scores]
return [
Document(
page_content=doc.page_content,
metadata={
k: v for k, v in doc.metadata.items() if k not in ("_score", "_id")
},
id=getattr(doc, "id", None),
)
for doc, _ in docs_with_scores
]

def similarity_search_with_score(
self,
Expand Down
2 changes: 1 addition & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[pytest]
markers =
integration: tests that require a running EnVector server and the real EnVector SDK
asyncio_mode = auto
testpaths =
tests

Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def test_e2e_vectorstore_plain_and_cipher():
(d.page_content[:80] + ("..." if len(d.page_content) > 80 else "")),
)
assert len(docs) >= 1
assert all("_id" in d.metadata for d in docs)
assert all(getattr(d, "id", None) or "_id" in d.metadata for d in docs)
# optional filter check if 'label' is part of meta
if not use_hf:
docs_f = store_plain.similarity_search(
Expand All @@ -153,7 +153,7 @@ def test_e2e_vectorstore_plain_and_cipher():
"[plain] results (explicit embedding e1):", [d.page_content for d in docs]
)
assert any(d.page_content == texts[0] for d in docs)
assert all("_id" in d.metadata for d in docs)
assert all(getattr(d, "id", None) or "_id" in d.metadata for d in docs)
docs_f = store_plain.similarity_search(
"q", k=2, embedding=e2, filter={"label": "B"}
)
Expand Down Expand Up @@ -189,15 +189,15 @@ def test_e2e_vectorstore_plain_and_cipher():
(d.page_content[:80] + ("..." if len(d.page_content) > 80 else "")),
)
assert len(docs_cc) >= 1
assert all("_id" in d.metadata for d in docs_cc)
assert all(getattr(d, "id", None) or "_id" in d.metadata for d in docs_cc)
else:
docs_cc = store_cc.similarity_search("q", k=2, embedding=e2)
print(
"[cipher] results (explicit embedding e2):",
[d.page_content for d in docs_cc],
)
assert any(d.page_content == texts[1] for d in docs_cc)
assert all("_id" in d.metadata for d in docs_cc)
assert all(getattr(d, "id", None) or "_id" in d.metadata for d in docs_cc)

# Cleanup
store_plain.client.ev.init_connect(address=address)
Expand Down
Loading