Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .github/workflows/pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: PR Checks

on:
pull_request:
branches:
- main

concurrency:
group: pr-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
checks:
runs-on: [self-hosted, linux, no-gpu]

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"

- name: Install dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
python -m pip install -e .
python -m pip install pytest pre-commit

- name: Lint and format
run: pre-commit run --all-files --show-diff-on-failure

- name: Run unit tests
run: python -m pytest -q -m "not integration"
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ Thumbs.db

# Sensitive local data
keys/
VECTORSTORE.md

# External symlinks (local workspace references)
es2-msa
es2-msa/
es2-deploy
es2-deploy/

# Local helper scripts
run_unit_tests.py

# Jupyter
.ipynb_checkpoints/
Expand Down
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.1
hooks:
- id: ruff
- repo: https://github.com/psf/black
rev: 24.10.0
hooks:
- id: black
language_version: python3.11
62 changes: 59 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ Encrypted vector search for LangChain using Envector (ES2), powered by homomorph
- `python3.11 -m venv .venv && source .venv/bin/activate`
- Install runtime dependencies:
- `pip install -U pip setuptools wheel`
- `pip install es2==1.1.0 langchain sentence-transformers`
- `pip install es2 langchain sentence-transformers`

## Usage Overview
1. Configure Envector using `EnvectorConfig`, pointing to your ES2 endpoint and keys.
2. Initialize embeddings (or provide pre-computed vectors).
3. Instantiate `Envector(config=cfg, embeddings=emb)` and call `add_texts` or `as_retriever`.
3. Instantiate `Envector(config=cfg, embeddings=emb)` and call `add_texts`, `add_documents`, or use `as_retriever`.
4. Run `similarity_search` or plug the retriever into your LangChain pipeline.

> See `notebooks/` for end-to-end walkthroughs and the `libs/envector` package for implementation details.
Expand All @@ -27,7 +27,7 @@ Encrypted vector search for LangChain using Envector (ES2), powered by homomorph
Key dataclasses live in `libs/envector/config.py`:
- `ConnectionConfig`: address or host/port for ES2.
- `KeyConfig`: key path, key ID, optional preset/eval mode.
- `IndexSettings`: index name, dimension (16–4096), query encryption mode, optional output fields and fetch parameters.
- `IndexSettings`: index name, dimension (32–4096), query encryption mode, optional output fields and fetch parameters.
- `EnvectorConfig`: wraps the above and enables auto-creation via `create_if_missing`.

## Data Model
Expand All @@ -41,10 +41,66 @@ Key dataclasses live in `libs/envector/config.py`:
- Manual item IDs are not accepted; returned IDs from `add_texts` are ephemeral.
- Filtering happens client-side; ensure metadata is JSON for structured filters.

## Examples
- Configuration
```python
from langchain_envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig

cfg = EnvectorConfig(
connection=ConnectionConfig(
address=ES2_ADDRESS,
access_token=ES2_ACCESS_TOKEN
),
key=KeyConfig(
key_path=ES2_KEY_PATH,
key_id=ES2_KEY_ID,
preset="ip",
eval_mode="rmp"
),
index=IndexSettings(
index_name=INDEX_NAME,
dim=vector_dim,
query_encryption="cipher"
),
create_if_missing=True,
)
```

- Add documents (from LangChain Documents):

```python
from langchain_core.documents import Document
from langchain_envector.vectorstore import Envector

docs = [
Document(
page_content="chunk-1",
metadata={"source": "paper.pdf", "page": 1, "chunk": 0}
),
Document(
page_content="chunk-2",
metadata={"source": "paper.pdf", "page": 1, "chunk": 1}
),
]

store = Envector(config=cfg, embeddings=emb)
store.add_documents(docs)
```

## Troubleshooting
- Connection issues: verify ES2 address and registered keys.
- Embeddings mismatch: ensure embedding dimension equals `index.dim` when supplying vectors.
- Unexpected raw strings: confirm inserts used the JSON envelope.
Comment thread
euphoria0-0 marked this conversation as resolved.
- Key issues: if you run into key-related errors, check that the local key's metadata matches the key registered on the server.

## Testing Without ES2
- Run unit tests offline (no ES2 or SDK required):
- `python -m pytest -q -m "not integration"`
- or `python scripts/run_unit_tests.py`
- Run integration tests (requires server and keys):
- Export `ES2_ADDRESS`, `ES2_KEY_PATH`, `ES2_KEY_ID`
- Optional: `ES2_USE_EMBEDDINGS=1`, `ES2_EMB_MODEL`, `ES2_USE_HF_DATASET=1`
- `python -m pytest -q -m integration -s`

## Contributing
See [`CONTRIBUTE.md`](CONTRIBUTE.md) for development, testing, and PR guidelines.
12 changes: 9 additions & 3 deletions libs/envector/examples/basic_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,22 @@

from __future__ import annotations

from libs.envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig
from libs.envector.config import (
ConnectionConfig,
EnvectorConfig,
IndexSettings,
KeyConfig,
)
from libs.envector.vectorstore import Envector


def main():
# Replace with your actual settings
cfg = EnvectorConfig(
connection=ConnectionConfig(address="localhost:50050"),
key=KeyConfig(key_path="./keys", key_id="example_key", preset="ip", eval_mode="rmp"),
key=KeyConfig(
key_path="./keys", key_id="example_key", preset="ip", eval_mode="rmp"
),
index=IndexSettings(index_name="demo", dim=384, query_encryption="plain"),
create_if_missing=True,
)
Expand All @@ -43,4 +50,3 @@ def main():

if __name__ == "__main__":
main()

16 changes: 12 additions & 4 deletions libs/envector/examples/cipher_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,24 @@

from __future__ import annotations

from libs.envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig
from libs.envector.config import (
ConnectionConfig,
EnvectorConfig,
IndexSettings,
KeyConfig,
)
from libs.envector.vectorstore import Envector


def main():
cfg = EnvectorConfig(
connection=ConnectionConfig(address="localhost:50050"),
key=KeyConfig(key_path="./keys", key_id="example_key", preset="ip", eval_mode="rmp"),
index=IndexSettings(index_name="demo_cipher", dim=384, query_encryption="cipher"),
key=KeyConfig(
key_path="./keys", key_id="example_key", preset="ip", eval_mode="rmp"
),
index=IndexSettings(
index_name="demo_cipher", dim=384, query_encryption="cipher"
),
create_if_missing=True,
)

Expand All @@ -38,4 +47,3 @@ def main():

if __name__ == "__main__":
main()

22 changes: 18 additions & 4 deletions libs/envector/examples/ingest_synthetic_1k.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@
from pathlib import Path
from typing import List

from libs.envector.config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig
from libs.envector.config import (
ConnectionConfig,
EnvectorConfig,
IndexSettings,
KeyConfig,
)
from libs.envector.vectorstore import Envector


Expand All @@ -34,7 +39,12 @@ def main():
ap.add_argument("--key-path", required=True)
ap.add_argument("--key-id", required=True)
ap.add_argument("--index-name", required=True)
ap.add_argument("--dim", type=int, required=False, help="If omitted and --use-embeddings, infer from model.")
ap.add_argument(
"--dim",
type=int,
required=False,
help="If omitted and --use-embeddings, infer from model.",
)
ap.add_argument("--dataset", default="data/synthetic_rag_1k.jsonl")
ap.add_argument("--use-embeddings", action="store_true")
ap.add_argument("--model", default="sentence-transformers/all-MiniLM-L6-v2")
Expand All @@ -52,7 +62,9 @@ def main():

cfg = EnvectorConfig(
connection=ConnectionConfig(address=args.address),
key=KeyConfig(key_path=args.key_path, key_id=args.key_id, preset="ip", eval_mode="rmp"),
key=KeyConfig(
key_path=args.key_path, key_id=args.key_id, preset="ip", eval_mode="rmp"
),
index=IndexSettings(
index_name=args.index_name,
dim=(args.dim if args.dim is not None else inferred_dim or 0),
Expand All @@ -76,7 +88,9 @@ def main():
if embeddings is None:
# Without embeddings, vectors would have to be supplied manually; this example does not
# support that, so fail fast. Pass --use-embeddings or adapt this to your own vector source.
raise ValueError("--use-embeddings is required unless you provide vectors explicitly.")
raise ValueError(
"--use-embeddings is required unless you provide vectors explicitly."
)
store.add_texts(t_batch, metadatas=m_batch)

print(f"Inserted {len(texts)} documents into index '{args.index_name}'")
Expand Down
9 changes: 7 additions & 2 deletions libs/envector/langchain_envector/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,10 @@
from .vectorstore import Envector
from .config import ConnectionConfig, EnvectorConfig, IndexSettings, KeyConfig

__all__ = ["Envector", "ConnectionConfig", "EnvectorConfig", "IndexSettings", "KeyConfig"]

__all__ = [
"Envector",
"ConnectionConfig",
"EnvectorConfig",
"IndexSettings",
"KeyConfig",
]
7 changes: 3 additions & 4 deletions libs/envector/langchain_envector/client.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import Optional

from .config import EnvectorConfig


Expand Down Expand Up @@ -34,7 +32,9 @@ def init(self):
else:
if not (c.host and c.port):
raise ValueError("Either address or host+port must be provided.")
es2_client.init_connect(host=c.host, port=c.port, access_token=c.access_token)
es2_client.init_connect(
host=c.host, port=c.port, access_token=c.access_token
)

# Key path baseline for Index
from es2.index import Index as _Index
Expand Down Expand Up @@ -79,4 +79,3 @@ def es2(self):
if self._es2 is None:
raise RuntimeError("Client not initialized. Call init().")
return self._es2

1 change: 0 additions & 1 deletion libs/envector/langchain_envector/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,3 @@ class EnvectorConfig:
key: KeyConfig
index: IndexSettings
create_if_missing: bool = True

4 changes: 3 additions & 1 deletion libs/envector/langchain_envector/retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@


class EnvectorRetriever:
def __init__(self, store: Envector, *, search_kwargs: Optional[Dict[str, Any]] = None) -> None:
def __init__(
self, store: Envector, *, search_kwargs: Optional[Dict[str, Any]] = None
) -> None:
self.store = store
self.search_kwargs = search_kwargs or {}

Expand Down
26 changes: 19 additions & 7 deletions libs/envector/langchain_envector/types.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union, overload
from typing import Any, Callable, Dict, List, Optional, Protocol


class Embeddings(Protocol):
Expand All @@ -10,10 +10,14 @@ class Embeddings(Protocol):
LangChain-compatible embeddings typically implement these two methods.
"""

def embed_documents(self, texts: List[str]) -> List[List[float]]: # pragma: no cover - interface only
def embed_documents(
self, texts: List[str]
) -> List[List[float]]: # pragma: no cover - interface only
...

def embed_query(self, text: str) -> List[float]: # pragma: no cover - interface only
def embed_query(
self, text: str
) -> List[float]: # pragma: no cover - interface only
...


Expand Down Expand Up @@ -53,8 +57,6 @@ def unpack_metadata(raw: Any) -> Dict[str, Any]:
if isinstance(raw, dict):
return raw

print("slafjklshglkhslafhlksadjlghsal;hf")

# Some responses wrap the payload in a single-element list.
if isinstance(raw, (list, tuple)):
if len(raw) == 1:
Expand Down Expand Up @@ -94,8 +96,13 @@ def unpack_metadata(raw: Any) -> Dict[str, Any]:

# --- Embeddings adaptation helpers -----------------------------------------------------


class _CallableEmbeddings:
def __init__(self, docs_fn: Callable[[List[str]], List[List[float]]], query_fn: Callable[[str], List[float]]):
def __init__(
self,
docs_fn: Callable[[List[str]], List[List[float]]],
query_fn: Callable[[str], List[float]],
):
self._docs_fn = docs_fn
self._query_fn = query_fn

Expand Down Expand Up @@ -132,7 +139,12 @@ def query_fn(text: str) -> List[float]:
return _CallableEmbeddings(docs_fn, query_fn)

# Case 3: Tuple of callables
if isinstance(emb, tuple) and len(emb) == 2 and callable(emb[0]) and callable(emb[1]):
if (
isinstance(emb, tuple)
and len(emb) == 2
and callable(emb[0])
and callable(emb[1])
):
docs_fn, query_fn = emb # type: ignore[assignment]
return _CallableEmbeddings(docs_fn, query_fn)

Expand Down
Loading