From cd2a775ac5d96b726047dafd180bb496887c720e Mon Sep 17 00:00:00 2001 From: Chengbiao Jin Date: Wed, 24 Jun 2026 15:01:50 -0700 Subject: [PATCH 01/33] Upgrade dependencies and migrate to LangChain 1.x - Float pinned dependencies to minimum-version constraints and upgrade the core stack to current releases. - Migrate all imports (top-level and lazy in-function) to the LangChain 1.x module layout. - Drop the unused top-level langchain package. - Remove a dead, unreferenced chat-agent module. - Update the model-evaluation tests to use replacements for the removed LangChain evaluators. Refs: GML-2137 --- common/chunkers/html_chunker.py | 2 +- common/chunkers/markdown_chunker.py | 2 +- common/chunkers/recursive_chunker.py | 2 +- common/db/schema_extraction.py | 2 +- common/embeddings/embedding_services.py | 2 +- .../LLMEntityRelationshipExtractor.py | 8 +- common/llm_services/aws_sagemaker_endpoint.py | 2 +- common/requirements.txt | 371 +++++++++--------- graphrag/Dockerfile | 2 +- graphrag/app/agent.py | 124 ------ graphrag/app/agent/agent_generation.py | 2 +- graphrag/app/agent/agent_graph.py | 2 +- .../app/agent/agent_hallucination_check.py | 2 +- graphrag/app/agent/agent_rewrite.py | 2 +- graphrag/app/agent/agent_router.py | 2 +- graphrag/app/agent/agent_usefulness_check.py | 2 +- graphrag/app/agent/method_selector.py | 2 +- graphrag/app/supportai/supportai_ingest.py | 4 +- graphrag/app/tools/generate_cypher.py | 6 +- graphrag/app/tools/generate_function.py | 8 +- graphrag/app/tools/generate_gsql.py | 6 +- graphrag/app/tools/map_question_to_schema.py | 8 +- graphrag/tests/test_connections.py | 5 +- graphrag/tests/test_service.py | 49 ++- 24 files changed, 267 insertions(+), 350 deletions(-) delete mode 100644 graphrag/app/agent.py diff --git a/common/chunkers/html_chunker.py b/common/chunkers/html_chunker.py index 83b3477..49df707 100644 --- a/common/chunkers/html_chunker.py +++ b/common/chunkers/html_chunker.py @@ -17,7 +17,7 @@ from common.chunkers.base_chunker 
import BaseChunker from common.chunkers.separators import TEXT_SEPARATORS from langchain_text_splitters import HTMLSectionSplitter -from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_text_splitters import RecursiveCharacterTextSplitter _DEFAULT_CHUNK_SIZE = 2048 diff --git a/common/chunkers/markdown_chunker.py b/common/chunkers/markdown_chunker.py index 85c1a82..ab8ba52 100644 --- a/common/chunkers/markdown_chunker.py +++ b/common/chunkers/markdown_chunker.py @@ -15,7 +15,7 @@ from common.chunkers.base_chunker import BaseChunker from common.chunkers.separators import TEXT_SEPARATORS from langchain_text_splitters.markdown import ExperimentalMarkdownSyntaxTextSplitter -from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_text_splitters import RecursiveCharacterTextSplitter # When chunk_size is not configured, cap any heading-section that exceeds this # so that form-based PDFs (tables/bold but no # headings) are not left as a diff --git a/common/chunkers/recursive_chunker.py b/common/chunkers/recursive_chunker.py index 69ee83a..b996a87 100644 --- a/common/chunkers/recursive_chunker.py +++ b/common/chunkers/recursive_chunker.py @@ -14,7 +14,7 @@ from common.chunkers.base_chunker import BaseChunker from common.chunkers.separators import TEXT_SEPARATORS -from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_text_splitters import RecursiveCharacterTextSplitter _DEFAULT_CHUNK_SIZE = 2048 diff --git a/common/db/schema_extraction.py b/common/db/schema_extraction.py index c1fe07c..06c5845 100644 --- a/common/db/schema_extraction.py +++ b/common/db/schema_extraction.py @@ -28,7 +28,7 @@ import re from typing import Iterable, List, Optional -from langchain.prompts import PromptTemplate +from langchain_core.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser from common.db.schema_utils import ( diff --git a/common/embeddings/embedding_services.py 
b/common/embeddings/embedding_services.py index e032c54..de74ccc 100644 --- a/common/embeddings/embedding_services.py +++ b/common/embeddings/embedding_services.py @@ -3,7 +3,7 @@ import time from typing import List -from langchain.schema.embeddings import Embeddings +from langchain_core.embeddings import Embeddings from langchain_openai import OpenAIEmbeddings from langchain_google_genai import GoogleGenerativeAIEmbeddings from langchain_ollama import OllamaEmbeddings diff --git a/common/extractors/LLMEntityRelationshipExtractor.py b/common/extractors/LLMEntityRelationshipExtractor.py index 43fdb67..ad5a7e0 100644 --- a/common/extractors/LLMEntityRelationshipExtractor.py +++ b/common/extractors/LLMEntityRelationshipExtractor.py @@ -408,8 +408,8 @@ def _build_rels(self, formatted_rels: list) -> list: return relationships async def adocument_er_extraction(self, document): - from langchain.prompts import ChatPromptTemplate - from langchain.output_parsers import PydanticOutputParser + from langchain_core.prompts import ChatPromptTemplate + from langchain_core.output_parsers import PydanticOutputParser parser = PydanticOutputParser(pydantic_object=KnowledgeGraph) @@ -447,8 +447,8 @@ async def adocument_er_extraction(self, document): def document_er_extraction(self, document): - from langchain.prompts import ChatPromptTemplate - from langchain.output_parsers import PydanticOutputParser + from langchain_core.prompts import ChatPromptTemplate + from langchain_core.output_parsers import PydanticOutputParser parser = PydanticOutputParser(pydantic_object=KnowledgeGraph) diff --git a/common/llm_services/aws_sagemaker_endpoint.py b/common/llm_services/aws_sagemaker_endpoint.py index 5134497..e331b70 100644 --- a/common/llm_services/aws_sagemaker_endpoint.py +++ b/common/llm_services/aws_sagemaker_endpoint.py @@ -34,7 +34,7 @@ def transform_output(self, output: bytes): class AWS_SageMaker_Endpoint(LLM_Model): def __init__(self, config): super().__init__(config) - from 
langchain.llms import SagemakerEndpoint + from langchain_community.llms import SagemakerEndpoint client = boto3.client( "sagemaker-runtime", diff --git a/common/requirements.txt b/common/requirements.txt index a52f992..c1a5793 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -1,185 +1,200 @@ -aiochannel==1.3.0 -aiohappyeyeballs==2.6.1 -aiohttp==3.12.13 -aiosignal==1.3.2 -annotated-types==0.7.0 -anyio==4.9.0 -appdirs==1.4.4 -argon2-cffi==25.1.0 -argon2-cffi-bindings==21.2.0 -async-timeout==5.0.1 -asyncer==0.0.8 -attrs==25.3.0 -azure-core==1.34.0 -azure-storage-blob==12.25.1 -backoff==2.2.1 -beautifulsoup4==4.13.4 +aiochannel>=1.3.0 +aiohappyeyeballs>=2.6.1 +aiohttp>=3.12.13 +aiosignal>=1.3.2 +annotated-types>=0.7.0 +anyio>=4.14.0 +appdirs>=1.4.4 +argon2-cffi>=25.1.0 +argon2-cffi-bindings>=21.2.0 +async-timeout>=5.0.1 +asyncer>=0.0.8 +attrs>=25.3.0 +azure-core>=1.34.0 +azure-storage-blob>=12.25.1 +backoff>=2.2.1 +beautifulsoup4>=4.13.4 boto3>=1.38.45 botocore>=1.38.45 -cachetools==5.5.2 -certifi==2025.6.15 -cffi==1.17.1 -chardet==5.2.0 -charset-normalizer==3.4.2 -click==8.2.1 -contourpy==1.3.2 -cryptography==45.0.4 -cycler==0.12.1 -dataclasses-json==0.6.7 -deepdiff==8.5.0 -distro==1.9.0 -docker-pycreds==0.4.0 -docstring_parser==0.16 -emoji==2.14.1 -environs==14.2.0 -exceptiongroup==1.3.0 -fastapi==0.118.0 -filelock==3.18.0 -filetype==1.2.0 -fonttools==4.58.4 -frozenlist==1.7.0 -fsspec==2025.5.1 -gitdb==4.0.12 -GitPython==3.1.44 -google-api-core==2.25.1 -google-auth==2.40.3 -google-cloud-aiplatform==1.99.0 -google-cloud-bigquery==3.34.0 -google-cloud-core==2.4.3 -google-cloud-resource-manager==1.14.2 -google-cloud-storage==2.19.0 -google-crc32c==1.7.1 -google-resumable-media==2.7.2 -googleapis-common-protos==1.70.0 -greenlet==3.2.3 -groq==0.29.0 -grpc-google-iam-v1==0.14.2 -grpcio==1.73.1 -grpcio-status==1.73.1 -h11==0.16.0 -httpcore==1.0.9 -httptools==0.6.4 -httpx==0.28.1 -huggingface-hub==0.33.1 -ibm-cos-sdk==2.14.2 -ibm-cos-sdk-core==2.14.2 
-ibm-cos-sdk-s3transfer==2.14.2 -ibm_watsonx_ai==1.3.26 -idna==3.10 -importlib_metadata==8.7.0 -iniconfig==2.1.0 -isodate==0.7.2 -jiter==0.10.0 -jmespath==1.0.1 -joblib==1.5.1 -jq==1.9.1 -jsonpatch==1.33 -jsonpath-python==1.0.6 -jsonpointer==3.0.0 -kiwisolver==1.4.8 -langchain>=0.3.26 +cachetools>=5.5.2 +certifi>=2025.6.15 +cffi>=1.17.1 +chardet>=5.2.0 +charset-normalizer>=3.4.2 +click>=8.4.1 +contourpy>=1.3.2 +cryptography>=45.0.4 +cycler>=0.12.1 +dataclasses-json>=0.6.7 +deepdiff>=8.5.0 +distro>=1.9.0 +docker-pycreds>=0.4.0 +docstring_parser>=0.16 +emoji>=2.14.1 +environs>=14.2.0 +exceptiongroup>=1.3.0 +fastapi>=0.138.0 +filelock>=3.18.0 +filetype>=1.2.0 +fonttools>=4.58.4 +frozenlist>=1.7.0 +fsspec>=2025.5.1 +gitdb>=4.0.12 +GitPython>=3.1.44 +google-api-core>=2.25.1 +google-auth>=2.40.3 +google-cloud-aiplatform>=1.99.0 +google-cloud-bigquery>=3.34.0 +google-cloud-core>=2.4.3 +google-cloud-resource-manager>=1.14.2 +google-cloud-storage>=2.19.0 +google-crc32c>=1.7.1 +google-resumable-media>=2.7.2 +googleapis-common-protos>=1.70.0 +greenlet>=3.2.3 +groq>=0.29.0 +grpc-google-iam-v1>=0.14.2 +grpcio>=1.73.1 +grpcio-status>=1.73.1 +h11>=0.16.0 +httpcore>=1.0.9 +httptools>=0.8.0 +httpx>=0.28.1 +huggingface-hub>=0.33.1 +ibm-cos-sdk>=2.14.2 +ibm-cos-sdk-core>=2.14.2 +ibm-cos-sdk-s3transfer>=2.14.2 +ibm_watsonx_ai>=1.3.26 +idna>=3.10 +importlib_metadata>=8.7.0 +iniconfig>=2.1.0 +isodate>=0.7.2 +jiter>=0.10.0 +jmespath>=1.0.1 +joblib>=1.5.1 +jq>=1.9.1 +jsonpatch>=1.33 +jsonpath-python>=1.0.6 +jsonpointer>=3.0.0 +kiwisolver>=1.4.8 langchain-core>=0.3.26 -langchain_google_genai==2.1.8 -langchain-google-vertexai==2.1.2 -langchain-community==0.3.26 -langchain-experimental==0.3.5rc1 -langchain-groq==0.3.4 -langchain-ibm==0.3.12 -langchain-openai==0.3.26 -langchain-ollama==0.3.7 -langchain-text-splitters==0.3.8 -langchain-aws==0.2.31 -langchainhub==0.1.21 -langdetect==1.0.9 -langgraph==0.4.10 -langgraph-checkpoint==2.1.0 -langsmith==0.4.2 -Levenshtein==0.27.1 -lomond==0.3.3 
-lxml==6.0.0 -marshmallow==3.26.1 -matplotlib==3.10.3 -multidict==6.5.1 -mypy-extensions==1.1.0 -nest-asyncio==1.6.0 -nltk==3.9.1 +langchain_google_genai>=2.1.8 +langchain-google-vertexai>=2.1.2 +langchain-community>=0.3.26 +langchain-experimental>=0.3.5rc1 +langchain-groq>=0.3.4 +langchain-ibm>=0.3.12 +langchain-openai>=0.3.26 +langchain-ollama>=0.3.7 +langchain-text-splitters>=0.3.8 +langchain-aws>=0.2.31 +langchainhub>=0.1.21 +langdetect>=1.0.9 +langgraph>=0.4.10 +langgraph-checkpoint>=2.1.0 +langsmith>=0.4.2 +Levenshtein>=0.27.1 +lomond>=0.3.3 +lxml>=6.0.0 +marshmallow>=3.26.1 +matplotlib>=3.10.3 +multidict>=6.5.1 +mypy-extensions>=1.1.0 +nest-asyncio>=1.6.0 +nltk>=3.9.1 numpy>=1, <2 -openai==1.92.2 +openai>=1.92.2 openpyxl>=3.1.0 xlrd>=2.0.1 -ordered-set==4.1.0 -orjson==3.10.18 -packaging==24.2 -pandas==2.2.3 -#pathtools==0.1.2 -pillow==11.2.1 -PyMuPDF==1.26.6 -pymupdf4llm==0.2.0 -platformdirs==4.3.8 -pluggy==1.6.0 -prometheus_client==0.22.1 -proto-plus==1.26.1 -protobuf==6.31.1 -psutil==7.0.0 -pyarrow==20.0.0 -pyasn1==0.6.1 -pyasn1_modules==0.4.2 -pycparser==2.22 -pycryptodome==3.23.0 -pydantic==2.11.7 -pydantic_core==2.33.2 -pygit2==1.18.0 -pyparsing==3.2.3 -pypdf==5.6.1 -pytest==8.4.1 -python-docx==1.1.2 -pytesseract==0.3.10 -python-dateutil==2.9.0.post0 -python-dotenv==1.1.1 -python-multipart==0.0.20 -python-iso639==2025.2.18 -python-magic==0.4.27 -pyTigerDriver==1.0.15 +ordered-set>=4.1.0 +orjson>=3.10.18 +packaging>=24.2 +pandas>=2.2.3 +#pathtools>=0.1.2 +pillow>=11.2.1 +PyMuPDF>=1.27.2.3 +pymupdf4llm>=1.27.2.3 +platformdirs>=4.3.8 +pluggy>=1.6.0 +prometheus_client>=0.22.1 +proto-plus>=1.26.1 +protobuf>=6.31.1 +psutil>=7.0.0 +pyarrow>=20.0.0 +pyasn1>=0.6.1 +pyasn1_modules>=0.4.2 +pycparser>=2.22 +pycryptodome>=3.23.0 +pydantic>=2.11.7 +pydantic_core>=2.33.2 +pygit2>=1.18.0 +pyparsing>=3.2.3 +pypdf>=5.6.1 +pytest>=8.4.1 +python-docx>=1.1.2 +pytesseract>=0.3.10 +python-dateutil>=2.9.0.post0 +python-dotenv>=1.1.1 +python-multipart>=0.0.32 
+python-iso639>=2025.2.18 +python-magic>=0.4.27 +pyTigerDriver>=1.0.15 pyTigerGraph>=2.0.4 -pytz==2025.2 -PyYAML==6.0.2 -rapidfuzz==3.13.0 -regex==2024.11.6 -requests==2.32.4 -requests-toolbelt==1.0.0 -rsa==4.9.1 -s3transfer==0.13.0 -scikit-learn==1.7.0 -scipy==1.16.0 -sentry-sdk==2.31.0 -setproctitle==1.3.6 -shapely==2.1.1 -six==1.17.0 -smmap==5.0.2 -sniffio==1.3.1 -soupsieve==2.7 -SQLAlchemy==2.0.41 -starlette==0.48.0 -tabulate==0.9.0 -tenacity==9.1.2 -threadpoolctl==3.6.0 -tiktoken==0.9.0 -tqdm==4.67.1 -types-requests==2.32.4.20250611 -types-urllib3==1.26.25.14 -typing-inspect==0.9.0 -typing_extensions==4.14.0 -tzdata==2025.2 -ujson==5.10.0 -unstructured==0.18.1 -unstructured-client==0.37.2 -urllib3==2.5.0 -uvicorn==0.34.3 -uvloop==0.21.0 -validators==0.35.0 -wandb==0.20.1 -watchfiles==1.1.0 -websockets==15.0.1 -wrapt==1.17.2 -yarl==1.20.1 -zipp==3.23.0 +pytz>=2025.2 +PyYAML>=6.0.2 +rapidfuzz>=3.13.0 +regex>=2024.11.6 +requests>=2.32.4 +requests-toolbelt>=1.0.0 +rsa>=4.9.1 +s3transfer>=0.13.0 +scikit-learn>=1.7.0 +scipy>=1.16.0 +sentry-sdk>=2.31.0 +setproctitle>=1.3.6 +shapely>=2.1.1 +six>=1.17.0 +smmap>=5.0.2 +sniffio>=1.3.1 +soupsieve>=2.7 +SQLAlchemy>=2.0.41 +starlette>=1.3.1 +tabulate>=0.9.0 +tenacity>=9.1.2 +threadpoolctl>=3.6.0 +tiktoken>=0.9.0 +tqdm>=4.67.1 +types-requests>=2.32.4.20250611 +types-urllib3>=1.26.25.14 +typing-inspect>=0.9.0 +typing_extensions>=4.14.0 +tzdata>=2025.2 +ujson>=5.10.0 +unstructured>=0.18.1 +unstructured-client>=0.37.2 +urllib3>=2.5.0 +uvicorn>=0.49.0 +uvloop>=0.22.1 +validators>=0.35.0 +wandb>=0.20.1 +watchfiles>=1.2.0 +websockets>=14.2 +wrapt>=1.17.2 +yarl>=1.20.1 +zipp>=3.23.0 + +# Agentic engine (v2.0) — MCP + tigergraph-mcp for in-process, per-user +# tool execution. Requires the fastapi/starlette bump above (mcp pulls +# starlette>=0.49). 
+mcp>=1.27.1 +tigergraph-mcp>=1.0.1 +sse-starlette>=3.4.4 +httpx-sse>=0.4.3 +pydantic-settings>=2.14.1 +jsonschema>=4.26.0 +jsonschema-specifications>=2025.9.1 +referencing>=0.37.0 +rpds-py>=0.30.0 +PyJWT>=2.13.0 +annotated-doc>=0.0.4 +typing-inspection>=0.4.2 diff --git a/graphrag/Dockerfile b/graphrag/Dockerfile index d46cd6d..4e75d17 100644 --- a/graphrag/Dockerfile +++ b/graphrag/Dockerfile @@ -19,4 +19,4 @@ ENV SERVER_CONFIG="/server_config.json" ENV LOGLEVEL="INFO" EXPOSE 8000 -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--ws", "websockets-sansio"] diff --git a/graphrag/app/agent.py b/graphrag/app/agent.py deleted file mode 100644 index f11000c..0000000 --- a/graphrag/app/agent.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2024-2026 TigerGraph, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import time -from langchain.agents import AgentType, initialize_agent -from typing import List, Union -import logging - -from pyTigerGraph import TigerGraphConnection - -from agent.agent_graph import TigerGraphAgentGraph -from tools import GenerateFunction, MapQuestionToSchema - -from common.embeddings.embedding_services import EmbeddingModel -from common.embeddings.base_embedding_store import EmbeddingStore -from common.metrics.prometheus_metrics import metrics -from common.metrics.tg_proxy import TigerGraphConnectionProxy -from common.llm_services.base_llm import LLM_Model - -from common.logs.log import req_id_cv -from common.logs.logwriter import LogWriter - -from typing_extensions import TypedDict - -logger = logging.getLogger(__name__) - - - -class TigerGraphAgent: - """TigerGraph Agent Class - - The TigerGraph Agent Class combines the various dependencies needed for a AI Agent to reason with data in a TigerGraph database. - - Args: - llm_provider (LLM_Model): - a LLM_Model class that connects to an external LLM API service. - db_connection (TigerGraphConnection): - a PyTigerGraph TigerGraphConnection object instantiated to interact with the desired database/graph and authenticated with correct roles. - embedding_model (EmbeddingModel): - a EmbeddingModel class that connects to an external embedding API service. - embedding_store (EmbeddingStore): - a EmbeddingStore class that connects to an embedding store to retrieve pyTigerGraph and custom query documentation from. 
- """ - - def __init__( - self, - llm_provider: LLM_Model, - db_connection: TigerGraphConnectionProxy, - embedding_model: EmbeddingModel, - embedding_store: EmbeddingStore, - ): - self.conn = db_connection - - self.llm = llm_provider - self.model_name = embedding_model.model_name - self.embedding_model = embedding_model - self.embedding_store = embedding_store - - self.mq2s = MapQuestionToSchema( - self.conn, self.llm - ) - self.gen_func = GenerateFunction( - self.conn, - self.llm, - embedding_model, - embedding_store, - ) - - self.agent = TigerGraphAgentGraph( - self.llm, self.conn, self.embedding_model, self.embedding_store, self.mq2s, self.gen_func - ).create_graph() - - - logger.debug(f"request_id={req_id_cv.get()} agent initialized") - - def question_for_agent(self, question: str): - """Question for Agent. - - Ask the agent a question to be answered by the database. Returns the agent resoposne or raises an exception. - - Args: - question (str): - The question to ask the agent - """ - start_time = time.time() - metrics.llm_inprogress_requests.labels(self.model_name).inc() - - try: - LogWriter.info(f"request_id={req_id_cv.get()} ENTRY question_for_agent") - logger.debug_pii( - f"request_id={req_id_cv.get()} question_for_agent question={question}" - ) - - for output in self.agent.stream({"question": question}): - for key, value in output.items(): - LogWriter.info(f"request_id={req_id_cv.get()} executed node {key}") - - LogWriter.info(f"request_id={req_id_cv.get()} EXIT question_for_agent") - return value["answer"] - except Exception as e: - metrics.llm_query_error_total.labels(self.model_name).inc() - LogWriter.error(f"request_id={req_id_cv.get()} FAILURE question_for_agent") - import traceback - - traceback.print_exc() - raise e - finally: - metrics.llm_request_total.labels(self.model_name).inc() - metrics.llm_inprogress_requests.labels(self.model_name).dec() - duration = time.time() - start_time - 
metrics.llm_request_duration_seconds.labels(self.model_name).observe( - duration - ) diff --git a/graphrag/app/agent/agent_generation.py b/graphrag/app/agent/agent_generation.py index 22d10d4..1889180 100644 --- a/graphrag/app/agent/agent_generation.py +++ b/graphrag/app/agent/agent_generation.py @@ -14,7 +14,7 @@ import json import logging -from langchain.prompts import PromptTemplate +from langchain_core.prompts import PromptTemplate from langchain_core.output_parsers import PydanticOutputParser from typing import Optional from pydantic import BaseModel, Field diff --git a/graphrag/app/agent/agent_graph.py b/graphrag/app/agent/agent_graph.py index 1c2925e..384764d 100644 --- a/graphrag/app/agent/agent_graph.py +++ b/graphrag/app/agent/agent_graph.py @@ -31,7 +31,7 @@ has_insufficient_context, ) from agent.Q import DONE, Q -from langchain.prompts import PromptTemplate +from langchain_core.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser from langgraph.graph import END, StateGraph from pyTigerGraph.common.exception import TigerGraphException diff --git a/graphrag/app/agent/agent_hallucination_check.py b/graphrag/app/agent/agent_hallucination_check.py index c51d2b4..287d7df 100644 --- a/graphrag/app/agent/agent_hallucination_check.py +++ b/graphrag/app/agent/agent_hallucination_check.py @@ -1,5 +1,5 @@ import logging -from langchain.prompts import PromptTemplate +from langchain_core.prompts import PromptTemplate from langchain_core.output_parsers import PydanticOutputParser from pydantic import BaseModel, Field diff --git a/graphrag/app/agent/agent_rewrite.py b/graphrag/app/agent/agent_rewrite.py index 4feda43..39aed75 100644 --- a/graphrag/app/agent/agent_rewrite.py +++ b/graphrag/app/agent/agent_rewrite.py @@ -1,6 +1,6 @@ import logging -from langchain.prompts import PromptTemplate +from langchain_core.prompts import PromptTemplate from langchain_core.output_parsers import PydanticOutputParser from pydantic import BaseModel, 
Field diff --git a/graphrag/app/agent/agent_router.py b/graphrag/app/agent/agent_router.py index 7668727..4bcb214 100644 --- a/graphrag/app/agent/agent_router.py +++ b/graphrag/app/agent/agent_router.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from langchain.prompts import PromptTemplate +from langchain_core.prompts import PromptTemplate from langchain_core.output_parsers import PydanticOutputParser from pydantic import BaseModel, Field diff --git a/graphrag/app/agent/agent_usefulness_check.py b/graphrag/app/agent/agent_usefulness_check.py index fe836f9..3522cef 100644 --- a/graphrag/app/agent/agent_usefulness_check.py +++ b/graphrag/app/agent/agent_usefulness_check.py @@ -1,4 +1,4 @@ -from langchain.prompts import PromptTemplate +from langchain_core.prompts import PromptTemplate from langchain_core.output_parsers import PydanticOutputParser from pydantic import BaseModel, Field diff --git a/graphrag/app/agent/method_selector.py b/graphrag/app/agent/method_selector.py index 724c735..d12fbc1 100644 --- a/graphrag/app/agent/method_selector.py +++ b/graphrag/app/agent/method_selector.py @@ -26,7 +26,7 @@ import logging from typing import Literal, Optional -from langchain.prompts import PromptTemplate +from langchain_core.prompts import PromptTemplate from langchain_core.output_parsers import PydanticOutputParser from pydantic import BaseModel, Field from pyTigerGraph.pyTigerGraph import TigerGraphConnection diff --git a/graphrag/app/supportai/supportai_ingest.py b/graphrag/app/supportai/supportai_ingest.py index ae19697..4a64741 100644 --- a/graphrag/app/supportai/supportai_ingest.py +++ b/graphrag/app/supportai/supportai_ingest.py @@ -11,8 +11,8 @@ from common.status import Status, IngestionProgress from common.extractors import LLMEntityRelationshipExtractor -from langchain.prompts import ChatPromptTemplate -from langchain.output_parsers import PydanticOutputParser +from 
langchain_core.prompts import ChatPromptTemplate +from langchain_core.output_parsers import PydanticOutputParser logger = logging.getLogger(__name__) diff --git a/graphrag/app/tools/generate_cypher.py b/graphrag/app/tools/generate_cypher.py index c1a1afc..853a8a2 100644 --- a/graphrag/app/tools/generate_cypher.py +++ b/graphrag/app/tools/generate_cypher.py @@ -15,9 +15,9 @@ import logging from typing import Iterable from langchain_core.output_parsers import StrOutputParser -from langchain.prompts import PromptTemplate -from langchain.tools import BaseTool -from langchain.llms.base import LLM +from langchain_core.prompts import PromptTemplate +from langchain_core.tools import BaseTool +from langchain_core.language_models.llms import LLM from common.metrics.tg_proxy import TigerGraphConnectionProxy from common.db.connections import get_schema_ver from common.db.schema_utils import render_schema_rep diff --git a/graphrag/app/tools/generate_function.py b/graphrag/app/tools/generate_function.py index 538ec9c..7061aeb 100644 --- a/graphrag/app/tools/generate_function.py +++ b/graphrag/app/tools/generate_function.py @@ -16,11 +16,11 @@ import logging from typing import Dict, List, Optional, Type, Union -from langchain.llms.base import LLM +from langchain_core.language_models.llms import LLM from langchain_core.output_parsers import PydanticOutputParser -from langchain.prompts import PromptTemplate -from langchain.tools import BaseTool -from langchain.tools.base import ToolException +from langchain_core.prompts import PromptTemplate +from langchain_core.tools import BaseTool +from langchain_core.tools import ToolException from common.embeddings.base_embedding_store import EmbeddingStore from common.embeddings.embedding_services import EmbeddingModel diff --git a/graphrag/app/tools/generate_gsql.py b/graphrag/app/tools/generate_gsql.py index 05a8017..bd05f1a 100644 --- a/graphrag/app/tools/generate_gsql.py +++ b/graphrag/app/tools/generate_gsql.py @@ -15,9 +15,9 @@ import 
logging from typing import Iterable from langchain_core.output_parsers import StrOutputParser -from langchain.prompts import PromptTemplate -from langchain.tools import BaseTool -from langchain.llms.base import LLM +from langchain_core.prompts import PromptTemplate +from langchain_core.tools import BaseTool +from langchain_core.language_models.llms import LLM from common.metrics.tg_proxy import TigerGraphConnectionProxy from common.db.connections import get_schema_ver from common.db.schema_utils import render_schema_rep diff --git a/graphrag/app/tools/map_question_to_schema.py b/graphrag/app/tools/map_question_to_schema.py index bde9f3d..d9d8bfa 100644 --- a/graphrag/app/tools/map_question_to_schema.py +++ b/graphrag/app/tools/map_question_to_schema.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from langchain.tools import BaseTool -from langchain.tools.base import ToolException -from langchain.llms.base import LLM -from langchain.prompts import PromptTemplate +from langchain_core.tools import BaseTool +from langchain_core.tools import ToolException +from langchain_core.language_models.llms import LLM +from langchain_core.prompts import PromptTemplate from langchain_core.output_parsers import PydanticOutputParser from common.metrics.tg_proxy import TigerGraphConnectionProxy diff --git a/graphrag/tests/test_connections.py b/graphrag/tests/test_connections.py index 40fdbb8..cb795d6 100644 --- a/graphrag/tests/test_connections.py +++ b/graphrag/tests/test_connections.py @@ -20,9 +20,8 @@ "common.llm_services", "common.session", "common.status", - "langchain", - "langchain.schema", - "langchain.schema.embeddings", + "langchain_core", + "langchain_core.embeddings", "prometheus_client", ]: if mod_name not in sys.modules: diff --git a/graphrag/tests/test_service.py b/graphrag/tests/test_service.py index b418f06..163f3d2 100644 --- a/graphrag/tests/test_service.py +++ 
b/graphrag/tests/test_service.py @@ -2,9 +2,10 @@ import os from fastapi.testclient import TestClient import json +import re import wandb -from langchain.evaluation import load_evaluator -from langchain.chat_models import ChatOpenAI +from langchain_openai import ChatOpenAI +from rapidfuzz.distance import JaroWinkler import time from pygit2 import Repository, Commit @@ -13,6 +14,36 @@ EPS = 0.001 +def _string_distance(prediction, reference) -> float: + """Normalized Jaro-Winkler distance in [0, 1] (0 = identical). + + Replaces the LangChain ``string_distance`` evaluator, which is no longer + available after the move off the top-level ``langchain`` package. + """ + return JaroWinkler.normalized_distance(str(prediction), str(reference)) + + +def _labeled_score(llm, prediction, reference, question) -> int: + """LLM-graded match of a submitted answer against a reference, 1-10. + + Replaces the LangChain ``labeled_score_string`` evaluator. Returns 0 when + no integer rating can be parsed from the model response. + """ + grading_prompt = ( + "You are grading a submitted answer against a reference answer.\n" + f"[Question]: {question}\n" + f"[Reference answer]: {reference}\n" + f"[Submitted answer]: {prediction}\n\n" + "On a scale from 1 to 10, rate how well the submitted answer matches " + "the reference answer in correctness and completeness. " + "Respond with only the integer rating." 
+ ) + resp = llm.invoke(grading_prompt) + text = getattr(resp, "content", resp) + match = re.search(r"\d+", str(text)) + return int(match.group()) if match else 0 + + class CommonTests: @classmethod def setUpClass(cls, schema="all", use_wandb=True): @@ -75,7 +106,6 @@ def json_are_equal(obj1, obj2, epsilon=EPS): ) t2 = time.time() self.assertEqual(resp.status_code, 200) - evaluator = load_evaluator("string_distance") try: answer = resp.json()["query_sources"]["result"] query_source = resp.json()["query_sources"]["function_call"] @@ -86,9 +116,7 @@ def json_are_equal(obj1, obj2, epsilon=EPS): question_answered = resp.json()["answered_question"] correct = False if isinstance(answer, str): - string_dist = evaluator.evaluate_strings( - prediction=answer, reference=true_answer - )["score"] + string_dist = _string_distance(answer, true_answer) if string_dist <= 0.2: correct = True elif isinstance(answer, list): @@ -122,19 +150,18 @@ def json_are_equal(obj1, obj2, epsilon=EPS): fp.close() llm = ChatOpenAI(**test_llm_config) - evaluator = load_evaluator("labeled_score_string", llm=llm) - - eval_result = evaluator.evaluate_strings( + score = _labeled_score( + llm, prediction=str(answer) + " answered by this function call: " + str(query_source), reference=str(true_answer) + " answered by this function call: " + str(function_call), - input=question, + question=question, ) - if eval_result["score"] >= 7: + if score >= 7: correct = True if self.USE_WANDB: From 9e05e45ccdfc81c5154eaae6d22582276d7b2528 Mon Sep 17 00:00:00 2001 From: Chengbiao Jin Date: Wed, 24 Jun 2026 13:58:55 -0700 Subject: [PATCH 02/33] Add structured and auto-detecting document chunkers - Add a structured chunker that preserves table and section layout, plus an auto chunker that picks a strategy from the document type. - Improve extracted-PDF text quality by repairing mojibake, vertical CJK runs, and broken table rows. - Bundle third-party license attribution for the structured-document parser. 
Refs: GML-2121, GML-2081 --- common/chunkers/__init__.py | 4 +- common/chunkers/auto.py | 118 ++++ common/chunkers/structured.py | 994 ++++++++++++++++++++++++++++++ common/utils/text_extractors.py | 212 ++++++- ecc/tests/README_chunkers.md | 165 +++++ ecc/tests/test_chunkers.py | 357 +++++++++++ ecc/tests/test_chunkers_demo.py | 198 ++++++ ecc/tests/test_chunkers_direct.py | 273 ++++++++ ecc/tests/test_chunkers_simple.py | 317 ++++++++++ licenses/README.md | 94 +++ licenses/docling-MIT | 21 + 11 files changed, 2730 insertions(+), 23 deletions(-) create mode 100644 common/chunkers/auto.py create mode 100644 common/chunkers/structured.py create mode 100644 ecc/tests/README_chunkers.md create mode 100644 ecc/tests/test_chunkers.py create mode 100644 ecc/tests/test_chunkers_demo.py create mode 100644 ecc/tests/test_chunkers_direct.py create mode 100644 ecc/tests/test_chunkers_simple.py create mode 100644 licenses/README.md create mode 100644 licenses/docling-MIT diff --git a/common/chunkers/__init__.py b/common/chunkers/__init__.py index d08ab60..1d8fa1d 100644 --- a/common/chunkers/__init__.py +++ b/common/chunkers/__init__.py @@ -5,4 +5,6 @@ from .regex_chunker import RegexChunker from .semantic_chunker import SemanticChunker from .recursive_chunker import RecursiveChunker -from .single_chunker import SingleChunker \ No newline at end of file +from .single_chunker import SingleChunker +from .structured import StructuredChunker, StructuredChunk +from .auto import AutoChunker, auto_detect_kind \ No newline at end of file diff --git a/common/chunkers/auto.py b/common/chunkers/auto.py new file mode 100644 index 0000000..5ee0140 --- /dev/null +++ b/common/chunkers/auto.py @@ -0,0 +1,118 @@ +# Copyright (c) 2024-2026 TigerGraph, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Content-aware chunker dispatcher. + +When ``graphrag_config.chunker = "auto"`` is set on a graph, the ECC +worker instantiates an ``AutoChunker``. For each document passed to +``chunk()``, the dispatcher inspects the content's structural density +and delegates to the most appropriate concrete chunker: + + - HTML tags present (````, ````, ````, ``

``…) + → ``structured`` chunker (HTML-aware atomic blocks, heading folding) + + - Markdown structure present (multiple ``|...|`` tables, several + ``![alt](url)`` figures, embedded ```` markers from + pymupdf4llm) → ``structured`` chunker + + - Several markdown headings but no table / figure / page signals + → ``markdown`` chunker (heading-aware section splitter) + + - No structure signals → ``semantic`` chunker (LLM-embedding-based + coherent splitting) + +Delegate chunkers are lazily instantiated and cached, so a graph +ingesting 50 markdown documents only instantiates one ``StructuredChunker``. +""" + +from __future__ import annotations + +import re +from typing import Callable, Dict + +from common.chunkers.base_chunker import BaseChunker + + +# Heuristic thresholds — tuned for typical document corpora. +_SAMPLE_BYTES = 8 * 1024 # how much of the doc to inspect +_TABLE_LINE_MIN = 3 # `|...|` lines to trigger structured +_FIGURE_LINE_MIN = 3 # `![alt](url)` lines to trigger structured +_HEADING_LINE_MIN_FOR_MD = 3 # markdown headings to trigger markdown chunker +_PAGE_MARKER_MIN = 2 # `` markers to trigger structured + +_HTML_INDICATORS = ( + "", "", + "

", "

", "

", "

", "") + + +def auto_detect_kind(content: str) -> str: + """Return the chunker name best matched to ``content``.""" + if not content: + return "single" + sample = content[:_SAMPLE_BYTES] + + # HTML — even a small fragment is a strong signal. + lowered = sample.lower() + if any(tag in lowered for tag in _HTML_INDICATORS): + return "structured" + + # Density signals on the markdown-shaped path. + lines = sample.split("\n") + table_lines = sum(1 for l in lines if _TABLE_LINE_RE.match(l)) + figure_lines = sum(1 for l in lines if _FIGURE_LINE_RE.search(l)) + heading_lines = sum(1 for l in lines if _HEADING_LINE_RE.match(l)) + page_markers = len(_PAGE_MARKER_RE.findall(sample)) + + has_atomic_structure = ( + table_lines >= _TABLE_LINE_MIN + or figure_lines >= _FIGURE_LINE_MIN + or page_markers >= _PAGE_MARKER_MIN + ) + if has_atomic_structure: + return "structured" + if heading_lines >= _HEADING_LINE_MIN_FOR_MD: + return "markdown" + return "semantic" + + +class AutoChunker(BaseChunker): + """Dispatches to a concrete chunker per document. + + ``factory`` is a callable that produces a concrete chunker given a + kind string (``"structured"`` / ``"markdown"`` / ``"semantic"`` / + ``"single"``). The factory is normally a thin wrapper around + ``ecc_util.get_chunker`` that closes over the per-graph config. + + Each unique kind is instantiated at most once per ECC pass and + cached, so a graph with many same-shaped documents reuses one + delegate instance. 
+ """ + + def __init__(self, factory: Callable[[str], BaseChunker]): + self._factory = factory + self._cache: Dict[str, BaseChunker] = {} + + def _delegate(self, kind: str) -> BaseChunker: + if kind not in self._cache: + self._cache[kind] = self._factory(kind) + return self._cache[kind] + + def chunk(self, content: str): + kind = auto_detect_kind(content) + return self._delegate(kind).chunk(content) diff --git a/common/chunkers/structured.py b/common/chunkers/structured.py new file mode 100644 index 0000000..9337aa5 --- /dev/null +++ b/common/chunkers/structured.py @@ -0,0 +1,994 @@ +# Copyright (c) 2024-2026 TigerGraph, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Page- and structure-aware chunker (v2.0 — GML-2121). + +Replaces char-count slicing for PDF and HTML ingest with an atomic-unit +chunker that respects markdown / HTML structure: + +- Tables (``|...|`` in markdown; ``

`` in HTML) are never split mid-row. +- Figures (``![alt](url)`` in markdown; ``
`` / ```` in HTML) keep + their caption. +- Lists (``
    `` / ``
      `` / ``
      ``) stay atomic up to a size threshold; + larger lists split at ``
    • `` boundaries with each subset still atomic. +- Code blocks (fenced markdown; ``
      `` / ````) stay whole.
+- Prose paragraphs stay whole and pack up to ``chunk_size``; only extreme outliers are char-split.
      +
      +The chunker is format-agnostic. Markdown and HTML inputs both reduce to a
      +uniform ``Element`` stream; a single ``pack`` step turns that stream into
      +``StructuredChunk`` instances (a ``str`` subclass — drop-in for existing
      +consumers that pass chunk text to embedding / entity extraction, with
      +metadata accessible via attributes for newer consumers).
      +"""
      +
      +from __future__ import annotations
      +
      +import logging
      +import re
      +from dataclasses import dataclass, field
      +from typing import Iterable, List, Literal, Optional, Tuple
      +
      +from common.chunkers.base_chunker import BaseChunker
      +from common.chunkers.separators import TEXT_SEPARATORS
      +
      +logger = logging.getLogger(__name__)
      +
      +
      +_DEFAULT_CHUNK_SIZE = 2048
      +_DEFAULT_OVERLAP_DIV = 8  # overlap defaults to chunk_size / 8 to match other chunkers
      +
      +
      +# --- public chunk type ------------------------------------------------------
      +
      +ChunkKind = Literal["prose", "table", "figure", "code", "list", "heading", "mixed"]
      +
      +
+class StructuredChunk(str):
+    """A chunk that behaves like ``str`` but carries structure metadata.
+
+    Subclassing ``str`` keeps existing consumers (embedding, entity
+    extraction, GSQL upserts) working unchanged — they see a string. New
+    consumers read ``chunk_kind`` / ``page_no`` / ``under_heading`` /
+    ``continues_from_page`` / ``continues_to_page`` via attributes.
+    """
+
+    chunk_kind: ChunkKind  # one of the ChunkKind literals ("prose", "table", ...)
+    page_no: Optional[int]  # page the chunk starts on; None when the source has no page info
+    under_heading: Optional[str]  # most recent heading text above the chunk, if any
+    continues_from_page: Optional[int]  # first page, set only when the chunk spans pages
+    continues_to_page: Optional[int]  # last page, set only when the chunk spans pages
+
+    def __new__(
+        cls,
+        text: str,
+        *,  # every metadata field below is keyword-only
+        chunk_kind: ChunkKind = "prose",
+        page_no: Optional[int] = None,
+        under_heading: Optional[str] = None,
+        continues_from_page: Optional[int] = None,
+        continues_to_page: Optional[int] = None,
+    ) -> "StructuredChunk":
+        instance = super().__new__(cls, text)  # str is immutable, so the value is fixed here in __new__
+        instance.chunk_kind = chunk_kind
+        instance.page_no = page_no
+        instance.under_heading = under_heading
+        instance.continues_from_page = continues_from_page
+        instance.continues_to_page = continues_to_page
+        return instance
+
+    def metadata(self) -> dict:  # plain-dict snapshot of the five metadata attributes
+        return {
+            "chunk_kind": self.chunk_kind,
+            "page_no": self.page_no,
+            "under_heading": self.under_heading,
+            "continues_from_page": self.continues_from_page,
+            "continues_to_page": self.continues_to_page,
+        }
      +
      +
      +# --- internal element type --------------------------------------------------
      +
      +ElementKind = Literal["prose", "table", "figure", "code", "list", "heading"]
      +
      +
      +@dataclass
      +class Element:
      +    """One typed unit extracted from a markdown or HTML source.
      +
      +    Atomic kinds (``table``, ``figure``, ``code``, ``list``) are never
      +    split below this granularity by the packer. ``heading`` elements are
      +    promoted to the ``heading`` field of subsequent elements so each
      +    packed chunk carries the most-recent section title.
      +    """
      +    kind: ElementKind
      +    text: str
      +    heading: Optional[str] = None       # most recent heading text
      +    page: Optional[int] = None          # PDF only — present when source has page metadata
      +    # For lists too long to keep atomic: pre-split sub-items the packer
      +    # can re-pack while keeping each subset atomic at ``
    • `` boundaries. + splittable_items: Optional[List[str]] = field(default=None, repr=False) + + +# --- markdown adapter ------------------------------------------------------- + +# Pure markdown table: a line starting with `|` and at least one more `|`. +_MD_TABLE_LINE = re.compile(r"^\s*\|.*\|\s*$") +# Markdown image / figure reference. +_MD_IMG_LINE = re.compile(r"^\s*!\[.*?\]\(.*?\)\s*$") +# Fenced code block delimiter. +_MD_CODE_FENCE = re.compile(r"^\s*```") +# Markdown heading line. +_MD_HEADING = re.compile(r"^\s*(#{1,6})\s+(.+?)\s*$") +# An HTML comment from pymupdf4llm chunk markers — informational only. +_MD_HTML_COMMENT = re.compile(r"^\s*\s*$") +# Page marker emitted by the PDF text extractor (see common/utils/text_extractors.py). +# Lines matching this update the "current page" for following elements without +# emitting an element themselves. +_MD_PAGE_MARKER = re.compile(r"^\s*\s*$") +# pymupdf4llm artifacts: +# • "==> picture [WxH] intentionally omitted <==" — image dropped (skip line) +# • "----- Start of picture text -----" / "----- End of picture text -----" +# bracket OCR'd content inside an image; we fold the body into the figure +# so chart-internal labels stay with the image chunk. +_MD_PICTURE_OMITTED = re.compile(r"^\s*\*+\s*==>\s*picture\b.*intentionally omitted\s*<==\s*\*+.*$", re.IGNORECASE) +_MD_PICTURE_TEXT_START = re.compile(r"^\s*\*+\s*-+\s*Start of picture text\s*-+\s*\*+\s*()?\s*$", re.IGNORECASE) +_MD_PICTURE_TEXT_END = re.compile(r"^\s*\*+\s*-+\s*End of picture text\s*-+\s*\*+\s*()?\s*$", re.IGNORECASE) + + +def _flush_prose(buf: List[str], heading: Optional[str], page: Optional[int], out: List[Element]) -> None: + if not buf: + return + text = "\n".join(buf).strip() + if text: + out.append(Element(kind="prose", text=text, heading=heading, page=page)) + buf.clear() + + +# A caption is a short single-or-double-line prose block that immediately +# precedes a table or figure with no blank line between them. 
We fold it +# into the atomic element so retrieval of "図表2 残高表(抜粋)" returns +# the table, not a sibling prose chunk. +_CAPTION_MAX_CHARS = 200 +_CAPTION_MAX_LINES = 2 + + +def _take_caption(buf: List[str]) -> Optional[str]: + """If ``buf`` looks like a caption (short, ≤2 lines), pop and return it. + Otherwise return None and leave ``buf`` untouched. + + Handles the no-blank-line case where the caption sits directly above + the table in the source: + + 図表2 残高表(抜粋) + |...|...| + + The blank-line case (pymupdf4llm typically emits this shape) is + handled by ``_take_caption_from_out`` instead. + """ + if not buf: + return None + if len(buf) > _CAPTION_MAX_LINES: + return None + joined = "\n".join(buf).strip() + if not joined or len(joined) > _CAPTION_MAX_CHARS: + return None + buf.clear() + return joined + + +def _take_caption_from_out(out: List[Element]) -> Optional[str]: + """If the most recently emitted element is a short prose block, pop + and return its text. Handles the blank-line case: + + 図表 7-2 都道府県別総預貯金額 ( 兆円 ) + ← blank line, prose flushed to ``out`` here + + |都道府県|... + + A heading or any non-prose immediately preceding the table blocks + the lookback (returns None), preserving the rule that a caption + above a section heading belongs to the section, not the next table. + """ + if not out or out[-1].kind != "prose": + return None + last = out[-1] + if len(last.text) > _CAPTION_MAX_CHARS: + return None + # Lines in the stored element text use single \n separators. + if last.text.count("\n") + 1 > _CAPTION_MAX_LINES: + return None + return out.pop().text + + +def markdown_to_elements(md: str, page: Optional[int] = None) -> List[Element]: + """Tokenize markdown into a stream of typed elements. + + Handles GFM-style tables (consecutive ``|...|`` rows), fenced code + blocks, image lines, headings, and prose paragraphs separated by + blank lines. HTML comments are dropped (pymupdf4llm leaves chunk + markers in some flows). 
+ """ + out: List[Element] = [] + heading: Optional[str] = None + prose_buf: List[str] = [] + + lines = md.splitlines() + i = 0 + while i < len(lines): + line = lines[i] + stripped = line.strip() + + # 1. Heading line. + m = _MD_HEADING.match(line) + if m: + _flush_prose(prose_buf, heading, page, out) + heading = m.group(2).strip() + out.append(Element(kind="heading", text=heading, heading=heading, page=page)) + i += 1 + continue + + # 2. Fenced code block — collect until matching fence. + if _MD_CODE_FENCE.match(line): + _flush_prose(prose_buf, heading, page, out) + block = [line] + i += 1 + while i < len(lines): + block.append(lines[i]) + if _MD_CODE_FENCE.match(lines[i]): + i += 1 + break + i += 1 + out.append(Element(kind="code", text="\n".join(block), heading=heading, page=page)) + continue + + # 3. Standalone image (figure) line. + if _MD_IMG_LINE.match(line): + caption = _take_caption(prose_buf) + _flush_prose(prose_buf, heading, page, out) + if caption is None: + caption = _take_caption_from_out(out) + body = line.strip() + if caption: + body = f"{caption}\n\n{body}" + out.append(Element(kind="figure", text=body, heading=heading, page=page)) + i += 1 + continue + + # 4. Markdown table — collect contiguous `|...|` lines, folding any + # short prose line immediately before it as the caption (e.g. + # "図表2 2011年3月末の資金循環統計の残高表(抜粋)" — the + # caption must travel with the table or retrieval misses it). + # The caption may sit directly above the table (in prose_buf) + # OR be separated by a blank line (already flushed to ``out``); + # we check both locations in that order. 
+ if _MD_TABLE_LINE.match(line): + caption = _take_caption(prose_buf) + _flush_prose(prose_buf, heading, page, out) + if caption is None: + caption = _take_caption_from_out(out) + block = [line] + i += 1 + while i < len(lines) and _MD_TABLE_LINE.match(lines[i]): + block.append(lines[i]) + i += 1 + body = "\n".join(block) + if caption: + body = f"{caption}\n\n{body}" + out.append(Element(kind="table", text=body, heading=heading, page=page)) + continue + + # 5a. Page marker — updates current page for following elements. + pm = _MD_PAGE_MARKER.match(line) + if pm: + _flush_prose(prose_buf, heading, page, out) + try: + page = int(pm.group(1)) + except ValueError: + pass + i += 1 + continue + + # 5b. Other HTML comments (chunk markers etc.) — skip. + if _MD_HTML_COMMENT.match(line): + i += 1 + continue + + # 5c. pymupdf4llm "==> picture ... intentionally omitted <==" — drop. + if _MD_PICTURE_OMITTED.match(line): + i += 1 + continue + + # 5d. pymupdf4llm picture-text block: ----- Start ... End of picture + # text ----- wraps OCR'd content (chart axis labels, legends). + # Fold the body into the immediately preceding figure when + # present so chart-internal text travels with the image. + if _MD_PICTURE_TEXT_START.match(line): + _flush_prose(prose_buf, heading, page, out) + i += 1 + block: List[str] = [] + while i < len(lines) and not _MD_PICTURE_TEXT_END.match(lines[i]): + block.append(lines[i]) + i += 1 + if i < len(lines): + i += 1 # skip the End marker + # Inline
      tags become line breaks for readability. + body = "\n".join(block) + body = re.sub(r"", "\n", body, flags=re.IGNORECASE).strip() + if not body: + continue + if out and out[-1].kind == "figure": + out[-1].text = f"{out[-1].text}\n\n{body}" + else: + # No preceding figure — emit as a standalone figure element + # (treating the OCR'd image content as a figure with no URL). + out.append(Element(kind="figure", text=body, heading=heading, page=page)) + continue + + # 6. Blank line — flush current prose paragraph. + if not stripped: + _flush_prose(prose_buf, heading, page, out) + i += 1 + continue + + # 7. Default: accumulate as prose. + prose_buf.append(line) + i += 1 + + _flush_prose(prose_buf, heading, page, out) + return out + + +def markdown_pages_to_elements(pages: Iterable[dict]) -> List[Element]: + """Convert ``pymupdf4llm.to_markdown(..., page_chunks=True)`` output + (a list of per-page dicts) into a flat element stream with each + element carrying its ``page`` number. + + pymupdf4llm exposes the page index under ``metadata.page_number`` + (1-based). ``metadata.page`` is a filename-style label and may be + absent, so we check both keys. + """ + out: List[Element] = [] + for p in pages or []: + page_no = None + md = p.get("text") or "" + meta = p.get("metadata") or {} + for key in ("page_number", "page"): + if key in meta: + try: + page_no = int(meta[key]) + break + except (TypeError, ValueError): + page_no = None + out.extend(markdown_to_elements(md, page=page_no)) + return out + + +# --- html adapter ----------------------------------------------------------- + +_HTML_ATOMIC = {"table", "pre", "ol", "ul", "dl", "figure", "blockquote"} +_HTML_PROSE = {"p"} +_HTML_HEADS = {f"h{i}" for i in range(1, 7)} +_HTML_SKIP = {"script", "style", "noscript", "meta", "link", "head"} + + +def html_to_elements(html: str) -> List[Element]: + """Walk an HTML document (or fragment) and emit a typed element + stream. 
See the design notes on GML-2121 for the tag classification. + """ + try: + from bs4 import BeautifulSoup, NavigableString + except ImportError as exc: # pragma: no cover — bs4 is a runtime dep + raise RuntimeError("structured chunker (HTML) requires beautifulsoup4") from exc + + soup = BeautifulSoup(html, "html.parser") + out: List[Element] = [] + root = soup.body or soup + _walk_html(root, out, heading=None, NavigableString=NavigableString) + return out + + +def _walk_html(node, out: List[Element], heading: Optional[str], NavigableString) -> None: + # Local import-bound NavigableString avoids re-importing in every recursive call. + for child in getattr(node, "children", []): + if isinstance(child, NavigableString): + text = str(child).strip() + if text: + out.append(Element(kind="prose", text=text, heading=heading)) + continue + tag = (child.name or "").lower() + if not tag or tag in _HTML_SKIP: + continue + if tag in _HTML_HEADS: + heading = child.get_text(strip=True) + if heading: + out.append(Element(kind="heading", text=heading, heading=heading)) + continue + if tag in _HTML_ATOMIC: + # Tables / blockquotes / code / figures stay atomic with their HTML preserved. + # Lists carry splittable_items so the packer can re-pack at
    • when too long. + if tag in {"ol", "ul", "dl"}: + # Collect every direct block-level child as a splittable unit + # (nested
        /
          /
/

, not just

  • ). + items: List[str] = [] + for c in child.children: + if isinstance(c, NavigableString): + t = str(c).strip() + if t: + items.append(t) + continue + cname = (c.name or "").lower() + if not cname or cname in _HTML_SKIP: + continue + items.append(str(c)) + out.append(Element( + kind="list", + text=str(child), + heading=heading, + splittable_items=items or None, + )) + elif tag == "table": + out.append(Element(kind="table", text=str(child), heading=heading)) + elif tag == "blockquote": + # Blockquote is prose-shaped but we keep it atomic. + out.append(Element( + kind="prose", + text=child.get_text(separator=" ", strip=True), + heading=heading, + )) + elif tag == "figure": + out.append(Element(kind="figure", text=str(child), heading=heading)) + else: + out.append(Element(kind="code", text=str(child), heading=heading)) + continue + if tag in _HTML_PROSE: + text = child.get_text(separator=" ", strip=True) + if text: + out.append(Element(kind="prose", text=text, heading=heading)) + continue + # Standalone outside a
    . + if tag == "img": + alt = (child.get("alt") or "").strip() + src = (child.get("src") or "").strip() + label = f'![{alt}]({src})' if src else alt + if label: + out.append(Element(kind="figure", text=label, heading=heading)) + continue + # walk-into:
    ,
    ,
    ,
    ,
  • ", re.IGNORECASE) + + +def _split_table_at_rows( + text: str, + hard_cap: int, +) -> List[str]: + """Split an HTML table at ```` boundaries, preserving the table + envelope and the header row(s) on every emitted piece. + + Strategy: locate the outermost ````…``
    ``. The first + one or two ```` blocks are treated as headers (kept on every + piece). Remaining body rows are packed greedily into pieces of at + most ``hard_cap`` chars. Each piece is wrapped as + ``{headers}{body_rows}
    ``. + + Falls back to plain char-split when no ```` boundaries are + found (e.g. the table is a single huge cell or the markup is + non-standard). + """ + open_match = _TABLE_OPEN_RE.search(text) + close_match = _TABLE_CLOSE_RE.search(text) + if not open_match or not close_match or close_match.start() < open_match.end(): + return [text] + + prefix = text[:open_match.start()] + open_tag = text[open_match.start():open_match.end()] + body = text[open_match.end():close_match.start()] + close_tag = text[close_match.start():close_match.end()] + suffix = text[close_match.end():] + + rows = _TR_BLOCK_RE.findall(body) + if len(rows) < 2: + return [text] # nothing to split at; let the caller char-split + + # Treat the first as the header. If the header is short and the + # second row contains , treat it as a continuation of the header. + header_count = 1 + if header_count < len(rows) and " structure. Fall back to char-split. + return [text] + + pieces: List[str] = [] + buf: List[str] = [] + buf_len = 0 + for row in body_rows: + rlen = len(row) + if buf and buf_len + rlen > row_budget: + pieces.append(prefix + open_tag + headers + "".join(buf) + close_tag + suffix) + buf = [row] + buf_len = rlen + else: + buf.append(row) + buf_len += rlen + if buf: + pieces.append(prefix + open_tag + headers + "".join(buf) + close_tag + suffix) + return pieces or [text] + + +def _split_list_at_items(text: str, hard_cap: int) -> List[str]: + """Split a long
      /
        at
      1. boundaries. Header (the opening +
          /
            + everything before the first
          • ) is preserved on each + piece, and each piece is closed properly. Falls back to char-split + when no
          • boundaries are found. + """ + li_blocks = re.findall(r"]*>.*?
          • ", text, re.IGNORECASE | re.DOTALL) + if len(li_blocks) < 2: + return [text] + # Find the wrapper open / close + wrap_open = re.search(r"<(?:ul|ol)\b[^>]*>", text, re.IGNORECASE) + wrap_close = re.search(r"", text, re.IGNORECASE) + if not wrap_open or not wrap_close or wrap_close.start() < wrap_open.end(): + return [text] + prefix = text[:wrap_open.start()] + open_tag = text[wrap_open.start():wrap_open.end()] + close_tag = text[wrap_close.start():wrap_close.end()] + suffix = text[wrap_close.end():] + + envelope = len(prefix) + len(open_tag) + len(close_tag) + len(suffix) + item_budget = hard_cap - envelope + if item_budget < 200: + return [text] + + pieces: List[str] = [] + buf: List[str] = [] + buf_len = 0 + for item in li_blocks: + ilen = len(item) + if buf and buf_len + ilen > item_budget: + pieces.append(prefix + open_tag + "".join(buf) + close_tag + suffix) + buf = [item] + buf_len = ilen + else: + buf.append(item) + buf_len += ilen + if buf: + pieces.append(prefix + open_tag + "".join(buf) + close_tag + suffix) + return pieces or [text] + + +def _split_atomic_oversized( + text: str, + kind: "ChunkKind", + page: Optional[int], + heading: Optional[str], + max_chars: int, + overlap: int, + hard_cap: int, +) -> List["StructuredChunk"]: + """Split an atomic block that exceeds the embedding cap. + + Dispatches by ``kind``: + * ``"table"`` — split at ```` boundaries via + :func:`_split_table_at_rows`, preserving the table envelope and + header row on every piece so each piece reads as a valid + sub-table for retrieval. + * ``"list"`` — split at ``
          • `` boundaries via + :func:`_split_list_at_items`, preserving the list wrapper. + * Other kinds (figure, code, prose) — fall back to the recursive + char splitter used for oversized prose. + + Returns one StructuredChunk per piece, all carrying the original + chunk_kind / page_no / under_heading. The caller is responsible for + appending these to the chunk stream. + """ + pieces: List[str] + if kind == "table": + pieces = _split_table_at_rows(text, hard_cap) + # If the table can't be row-split (no boundaries), fall back + # to char-split so we still respect the embedding cap. + if len(pieces) == 1 and len(pieces[0]) > hard_cap: + pieces = _split_prose(text, min(max_chars, hard_cap), overlap) + elif kind == "list": + pieces = _split_list_at_items(text, hard_cap) + if len(pieces) == 1 and len(pieces[0]) > hard_cap: + pieces = _split_prose(text, min(max_chars, hard_cap), overlap) + else: + pieces = _split_prose(text, min(max_chars, hard_cap), overlap) + return [ + StructuredChunk( + piece, + chunk_kind=kind, + page_no=page, + under_heading=heading, + ) + for piece in pieces + ] + + +def pack( + elements: List[Element], + max_chars: int = _DEFAULT_CHUNK_SIZE, + overlap: Optional[int] = None, +) -> List[StructuredChunk]: + """Convert a typed element stream into ``StructuredChunk`` instances. + + Rules: + - Atomic elements (table / figure / code / list) emit standalone chunks + with their ``kind`` preserved. A list element longer than ``max_chars`` + is re-packed at ``
          • `` boundaries via ``splittable_items``. + - **Prose paragraphs are also atomic** — a single paragraph is never + split mid-sentence regardless of size. Multiple short paragraphs + under the same heading get packed together up to ``max_chars``; + a paragraph larger than ``max_chars`` becomes one oversized chunk + (matches table behaviour). Safety valve: a paragraph larger than + ``max_chars * _PROSE_OVERSIZE_RATIO`` falls back to recursive char + splitting so we don't exceed the embedding model's context window. + - Headings annotate following chunks' ``under_heading`` but do not + themselves emit chunks. + - ``page`` from the source flows onto each emitted chunk; multi-page + atomic blocks (today: none — pymupdf4llm assigns one page per + element) get ``continues_from_page`` / ``continues_to_page`` set + via the page-tracking pass below. + """ + if overlap is None: + overlap = max(0, max_chars // _DEFAULT_OVERLAP_DIV) + oversize_threshold = max_chars * _PROSE_OVERSIZE_RATIO + + chunks: List[StructuredChunk] = [] + # prose_buf packs whole-paragraph Elements until adding the next one + # would exceed max_chars, then flushes. No element is ever split. 
+ prose_buf: List[Element] = [] + prose_heading: Optional[str] = None + prose_len = 0 + + def flush_prose() -> None: + nonlocal prose_buf, prose_heading, prose_len + if not prose_buf: + return + text = "\n\n".join(e.text for e in prose_buf).strip() + if not text: + prose_buf, prose_len = [], 0 + return + pages = [e.page for e in prose_buf if e.page is not None] + first_page = pages[0] if pages else None + last_page = pages[-1] if pages else None + cont_from = first_page if (last_page is not None and first_page != last_page) else None + cont_to = last_page if (last_page is not None and first_page != last_page) else None + chunks.append(StructuredChunk( + text, + chunk_kind="prose", + page_no=first_page, + under_heading=prose_heading, + continues_from_page=cont_from, + continues_to_page=cont_to, + )) + prose_buf, prose_len = [], 0 + + def emit_oversized_prose(elem: Element) -> None: + """Safety valve: pathologically long single paragraph. Char-split + as a last resort and emit each piece as its own prose chunk.""" + for piece in _split_prose(elem.text, max_chars, overlap): + chunks.append(StructuredChunk( + piece, + chunk_kind="prose", + page_no=elem.page, + under_heading=elem.heading, + )) + + for elem in elements: + if elem.kind == "heading": + flush_prose() + prose_heading = elem.heading + # The heading itself does not become a chunk; following + # elements carry .heading via their Element fields, and + # prose_heading is the packer-side memo for chunk metadata. 
+ continue + + if elem.kind in ("table", "figure", "code"): + flush_prose() + kind = _atomic_kind_for(elem) + if len(elem.text) > _ATOMIC_HARD_MAX_CHARS: + chunks.extend(_split_atomic_oversized( + elem.text, kind, elem.page, elem.heading, + max_chars, overlap, _ATOMIC_HARD_MAX_CHARS, + )) + else: + chunks.append(StructuredChunk( + elem.text, + chunk_kind=kind, + page_no=elem.page, + under_heading=elem.heading, + )) + continue + + if elem.kind == "list": + flush_prose() + if len(elem.text) <= max_chars or not elem.splittable_items: + if len(elem.text) > _ATOMIC_HARD_MAX_CHARS: + chunks.extend(_split_atomic_oversized( + elem.text, "list", elem.page, elem.heading, + max_chars, overlap, _ATOMIC_HARD_MAX_CHARS, + )) + else: + chunks.append(StructuredChunk( + elem.text, + chunk_kind="list", + page_no=elem.page, + under_heading=elem.heading, + )) + else: + for body in _pack_list_items(elem.splittable_items, max_chars): + if len(body) > _ATOMIC_HARD_MAX_CHARS: + chunks.extend(_split_atomic_oversized( + body, "list", elem.page, elem.heading, + max_chars, overlap, _ATOMIC_HARD_MAX_CHARS, + )) + else: + chunks.append(StructuredChunk( + body, + chunk_kind="list", + page_no=elem.page, + under_heading=elem.heading, + )) + continue + + # Prose: atomic paragraph packing. + # Different heading context → flush before adopting the new one. + if elem.heading != prose_heading and prose_buf: + flush_prose() + prose_heading = elem.heading + + elem_len = len(elem.text) + + # Pathologically long single paragraph → safety-valve fallback. + if elem_len > oversize_threshold: + flush_prose() + emit_oversized_prose(elem) + continue + + # Packing rule: if adding this paragraph would push the buffer + # past max_chars and the buffer is non-empty, flush first so each + # output chunk fits the target size. A single paragraph that + # alone exceeds max_chars is still emitted whole (atomic-prose). 
+ if prose_buf and (prose_len + elem_len > max_chars): + flush_prose() + + prose_buf.append(elem) + prose_len += elem_len + + flush_prose() + + # Merge tiny adjacent chunks so heading-only and section-marker + # fragments ("目次", "《留意点》", "<7-1>", ...) don't pollute the + # embedding index. A chunk smaller than ``_MIN_CHUNK_CHARS`` is + # absorbed into a neighbor when: + # * the same ``chunk_kind`` (don't merge a table into prose), + # * the same ``under_heading`` (don't cross section boundaries), + # * the resulting merged chunk stays under ``max_chars``. + # Prefers merging tiny chunks into the previous chunk; falls back to + # the next chunk when the previous doesn't qualify. + chunks = _merge_tiny_chunks(chunks, max_chars=max_chars) + return chunks + + +_MIN_CHUNK_CHARS_RATIO = 0.5 # min size = max_chars * ratio + + +def _merge_tiny_chunks( + chunks: List[StructuredChunk], + max_chars: int, +) -> List[StructuredChunk]: + """Merge chunks smaller than ``max_chars * _MIN_CHUNK_CHARS_RATIO`` + into a neighbor when the merge keeps the result under ``max_chars`` + and the neighbor matches ``chunk_kind`` + ``under_heading``. + + Walks the chunk list once. For each chunk, checks whether it's + small enough to be merged; if so, absorbs into the previous chunk + when compatible, else into the next; else leaves it standalone. + """ + if not chunks: + return chunks + min_chars = int(max_chars * _MIN_CHUNK_CHARS_RATIO) + merged: List[StructuredChunk] = [] + pending: List[StructuredChunk] = list(chunks) + i = 0 + while i < len(pending): + c = pending[i] + if len(c) >= min_chars: + merged.append(c) + i += 1 + continue + # c is tiny — try to merge into the previous chunk first. + if merged and _can_merge(merged[-1], c, max_chars): + merged[-1] = _merge_pair(merged[-1], c) + i += 1 + continue + # else try to merge into the next chunk. 
+ if i + 1 < len(pending) and _can_merge(c, pending[i + 1], max_chars): + pending[i + 1] = _merge_pair(c, pending[i + 1]) + i += 1 + continue + # No compatible neighbor — keep the tiny chunk standalone. + merged.append(c) + i += 1 + return merged + + +def _can_merge(a: StructuredChunk, b: StructuredChunk, max_chars: int) -> bool: + """Two chunks are mergeable when they share kind + heading and the + combined length fits ``max_chars``. We don't merge atomic kinds + (table / figure / code / list) into anything — those carry HTML + envelopes that can't be naively concatenated. + """ + if a.chunk_kind != b.chunk_kind: + return False + if a.chunk_kind in ("table", "figure", "code", "list"): + return False + if (a.under_heading or "") != (b.under_heading or ""): + return False + # +2 accounts for the "\n\n" joiner. + return len(a) + len(b) + 2 <= max_chars + + +def _merge_pair(a: StructuredChunk, b: StructuredChunk) -> StructuredChunk: + """Concatenate two compatible chunks. Page metadata: if both share a + page, keep it; otherwise mark continues_from / continues_to. + """ + text = (str(a).rstrip() + "\n\n" + str(b).lstrip()).strip() + same_page = a.page_no == b.page_no + return StructuredChunk( + text, + chunk_kind=a.chunk_kind, + page_no=a.page_no if same_page else a.page_no, + under_heading=a.under_heading, + continues_from_page=a.continues_from_page if same_page else a.page_no, + continues_to_page=a.continues_to_page if same_page else b.page_no, + ) + + +# --- chunker wrapper -------------------------------------------------------- + + +class StructuredChunker(BaseChunker): + """Structure-aware chunker. + + ``chunk(input_text)`` accepts either a markdown string or an HTML string + — format auto-detected by leading ``<`` content (HTML) versus anything + else (markdown). 
For multi-page PDF inputs, callers should instead use + ``chunk_pages(pages)`` with the per-page dict list from + ``pymupdf4llm.to_markdown(..., page_chunks=True)`` so page numbers + propagate to chunk metadata. + """ + + def __init__( + self, + chunk_size: int = 0, + overlap_size: int = -1, + ): + self.chunk_size = chunk_size if chunk_size > 0 else _DEFAULT_CHUNK_SIZE + self.overlap_size = ( + overlap_size if overlap_size >= 0 else self.chunk_size // _DEFAULT_OVERLAP_DIV + ) + + def chunk(self, input_text: str) -> List[StructuredChunk]: + elements = self._detect_and_tokenize(input_text) + return pack(elements, max_chars=self.chunk_size, overlap=self.overlap_size) + + def chunk_pages(self, pages: Iterable[dict]) -> List[StructuredChunk]: + elements = markdown_pages_to_elements(pages) + return pack(elements, max_chars=self.chunk_size, overlap=self.overlap_size) + + @staticmethod + def _detect_and_tokenize(text: str) -> List[Element]: + stripped = (text or "").lstrip() + looks_html = stripped.startswith("<") and ( + "**別**
            **信**
            **用**... +# 個


            用... +# which bloat tokens 3-5x and confuse retrieval embeddings. The CJK +# Unicode ranges below cover CJK Unified Ideographs (U+4E00-U+9FFF), +# Hiragana / Katakana / CJK Symbols (U+3000-U+30FF), and full-width +# / half-width forms (U+FF00-U+FFEF). +_CJK_CHAR_CLASS = r"[ -鿿＀-￯]" +_VERTICAL_BOLD_CJK = re.compile( + rf"(?:\*\*{_CJK_CHAR_CLASS}\*\*(?:)){{2,}}\*\*{_CJK_CHAR_CLASS}\*\*" +) +_VERTICAL_CJK = re.compile( + rf"(?:{_CJK_CHAR_CLASS}){{2,}}{_CJK_CHAR_CLASS}" +) + +# Within-cell
            tags inside markdown table rows. pymupdf4llm uses these +# to mark visual line breaks inside a single cell (vertical-numeric runs +# like ``|3
            4
            5|``, or single-character mojibake glyph sequences). +# Whatever the cause, the result is a cell that retrieval treats as +# multiple unrelated tokens. Stripping ``
            `` inside ``|...|`` rows +# reunites the cell text on one logical line; ``
            `` outside table +# rows is left alone since it usually marks an intentional break. +_TABLE_LINE_RE = re.compile(r"^\s*\|") +_BR_TAG_RE = re.compile(r"", re.IGNORECASE) + +# Mojibake detection: PDFs whose embedded font CMap can't be resolved +# emit runs of Latin-1 supplement characters (À-ÿ, ¡-¿), control glyphs, +# or U+FFFD replacement characters. None of these are expected in +# legitimate Japanese or English text at high density. A line whose +# share of suspicious characters exceeds the threshold gets logged. +_MOJIBAKE_HIGH_LATIN1 = re.compile(r"[ -ÿ€-Ÿ]") +_MOJIBAKE_REPLACEMENT = "�" +_MOJIBAKE_LINE_RATIO = 0.20 # report lines where >=20% of chars look corrupt +_MOJIBAKE_MIN_LINE_LEN = 8 + + +def _detect_mojibake(text: str, source_hint: str = "") -> list[dict]: + """Scan markdown for lines that look like failed glyph decoding. + + Returns a list of finding dicts with line_no, ratio, sample. Callers + log these so PDFs with broken CMaps can be flagged for re-extraction + or OCR fallback. We do not attempt to repair the text in-place — + upstream extraction is the only place where the original glyphs can + actually be recovered. + """ + findings: list[dict] = [] + if not text: + return findings + for line_no, line in enumerate(text.split("\n"), 1): + if len(line) < _MOJIBAKE_MIN_LINE_LEN: + continue + suspicious = len(_MOJIBAKE_HIGH_LATIN1.findall(line)) + replacement = line.count(_MOJIBAKE_REPLACEMENT) + weighted = suspicious + replacement * 5 + ratio = weighted / max(1, len(line)) + if ratio >= _MOJIBAKE_LINE_RATIO: + findings.append({ + "line_no": line_no, + "ratio": round(ratio, 3), + "suspicious_chars": suspicious, + "replacement_chars": replacement, + "sample": line[:160], + "source": source_hint, + }) + return findings + + +def _strip_br_in_table_rows(text: str) -> str: + """Remove ``
            `` tags inside markdown table rows. + + Rationale documented at _TABLE_LINE_RE. + """ + out: list[str] = [] + for line in text.split("\n"): + if _TABLE_LINE_RE.match(line): + line = _BR_TAG_RE.sub(" ", line) + out.append(line) + return "\n".join(out) + + +def _collapse_vertical_cjk(text: str) -> str: + """Collapse pymupdf4llm's per-character vertical-CJK runs back into a + single token. Bold runs ``**X**
            **Y**
            **Z**`` become ``**XYZ**``; + non-bold runs ``X
            Y
            Z`` become ``XYZ``. + + Only operates on runs of three or more contiguous CJK characters + separated by ``
            `` tags — incidental two-character ``
            ``-joined + pairs aren't matched so we don't disturb legitimate inline content. + """ + def _fix_bold(m: re.Match) -> str: + chars = re.findall(rf"\*\*({_CJK_CHAR_CLASS})\*\*", m.group(0)) + return f"**{''.join(chars)}**" if chars else m.group(0) + + def _fix_plain(m: re.Match) -> str: + return re.sub(r"", "", m.group(0)) -def _clean_pdf_markdown(markdown: str) -> str: + text = _VERTICAL_BOLD_CJK.sub(_fix_bold, text) + return _VERTICAL_CJK.sub(_fix_plain, text) + + +def _clean_pdf_markdown(markdown: str, source_hint: str = "") -> str: """Apply post-processing to markdown produced by pymupdf4llm for form PDFs. - Two specific artefacts are fixed: + Three specific artefacts are fixed: 1. **Duplicate table rows** — complex form PDFs (e.g. IRS forms) often have overlapping text layers (a rendered background layer plus a searchable text @@ -49,11 +152,42 @@ def _clean_pdf_markdown(markdown: str) -> str: cannot derive a header from the PDF's column structure. These are replaced with empty strings so the table is still valid markdown but does not expose internal artefacts to downstream consumers. + + 3. **Vertical-CJK runs** — Japanese / Chinese / Korean characters laid out + vertically in a PDF table cell get emitted as one character per line + with ``
            `` separators and per-character bold markers. The run is + collapsed back into a single token so embedding and retrieval see the + intended word (e.g. ``**個別信用購入あっせん**``) rather than ten + fragments. """ # --- Pass 1: remove ColN placeholders --- markdown = _coln_pattern.sub('', markdown) - # --- Pass 2: deduplicate consecutive table rows --- + # --- Pass 2: collapse vertical-CJK runs (do this BEFORE row dedup so + # rows that differ only by the collapsed form aren't treated as + # distinct rows). + markdown = _collapse_vertical_cjk(markdown) + + # --- Pass 2b: strip
            inside markdown table rows --- + markdown = _strip_br_in_table_rows(markdown) + + # --- Pass 2c: log lines that look like mojibake (failed glyph decode). + # We don't repair these — the underlying glyphs aren't recoverable + # from the markdown — but logging gives operators a grep target. + findings = _detect_mojibake(markdown, source_hint) + if findings: + logger.warning( + "[CONVERSION ISSUE] %s: %d line(s) look like mojibake / glyph-decode failure (first 3 shown)", + source_hint or "", + len(findings), + ) + for f in findings[:3]: + logger.warning( + "[CONVERSION ISSUE] line %d (ratio=%.2f, suspicious=%d, replacement=%d): %r", + f["line_no"], f["ratio"], f["suspicious_chars"], f["replacement_chars"], f["sample"], + ) + + # --- Pass 3: deduplicate consecutive table rows --- lines = markdown.splitlines() cleaned: list[str] = [] for line in lines: @@ -67,7 +201,15 @@ def _clean_pdf_markdown(markdown: str) -> str: continue cleaned.append(line) - return '\n'.join(cleaned) + markdown = '\n'.join(cleaned) + + # --- Pass 4: collapse runs of 3+ blank lines into a single blank + # line. pymupdf4llm emits large vertical whitespace where the PDF + # has visual blank space (e.g. below a chart that fills most of a + # page); these don't add information and bloat chunk sizes. + markdown = re.sub(r"(?:\r?\n[ \t]*){3,}", "\n\n", markdown) + + return markdown def extract_images(md_text): @@ -477,29 +619,55 @@ def _extract_pdf_with_images_as_docs(file_path, base_doc_id, graphname=None): if image_output_folder.exists(): shutil.rmtree(image_output_folder, ignore_errors=True) - # Convert PDF to markdown with extracted image files + # Convert PDF to markdown with extracted image files. # Use lock because pymupdf4llm's table extraction is not thread-safe - # See: https://github.com/pymupdf/PyMuPDF/issues/3241 + # (https://github.com/pymupdf/PyMuPDF/issues/3241). + # + # page_chunks=True returns a list[dict] (one per page) carrying + # per-page metadata. 
We re-join into a single markdown string with + # `` markers between pages so the structured chunker + # (common/chunkers/structured.py) can attach page_no to each + # emitted chunk. Markdown / character / semantic chunkers ignore + # the comments — they're inert HTML comments to those chunkers. + def _to_markdown_paged(strategy: str | None = None): + kwargs = dict( + write_images=True, + image_path=str(image_output_folder), + margins=0, + image_size_limit=0.08, + page_chunks=True, + ) + if strategy: + kwargs["table_strategy"] = strategy + pages = pymupdf4llm.to_markdown(file_path, **kwargs) + if not isinstance(pages, list): + return pages or "" + parts = [] + for p in pages: + page_no = None + meta = p.get("metadata") or {} + # pymupdf4llm exposes the page index under ``page_number`` + # (1-based) in each chunk's metadata. ``page`` is the + # filename-style label and not always populated. + for key in ("page_number", "page"): + if key in meta: + try: + page_no = int(meta[key]) + break + except (TypeError, ValueError): + page_no = None + if page_no is not None: + parts.append(f"") + parts.append(p.get("text") or "") + return "\n\n".join(parts) + with _pymupdf4llm_lock: try: - markdown_content = pymupdf4llm.to_markdown( - file_path, - write_images=True, - image_path=str(image_output_folder), # unique folder per PDF - margins=0, - image_size_limit=0.08, - ) + markdown_content = _to_markdown_paged() except Exception: # Retry with table_strategy="lines" if first attempt fails try: - markdown_content = pymupdf4llm.to_markdown( - file_path, - write_images=True, - image_path=str(image_output_folder), # unique folder per PDF - margins=0, - image_size_limit=0.08, - table_strategy="lines", - ) + markdown_content = _to_markdown_paged(strategy="lines") except Exception as e: logger.error(f"pymupdf4llm failed for {file_path}: {e}") # Cleanup folder if it was created @@ -527,7 +695,7 @@ def _extract_pdf_with_images_as_docs(file_path, base_doc_id, graphname=None): }] # Clean up 
artefacts common in form PDFs (duplicate rows, ColN headers) - markdown_content = _clean_pdf_markdown(markdown_content) + markdown_content = _clean_pdf_markdown(markdown_content, source_hint=str(file_path)) # Rename image files that contain spaces to avoid path-parsing issues markdown_content = _sanitize_image_filenames(image_output_folder, markdown_content) diff --git a/ecc/tests/README_chunkers.md b/ecc/tests/README_chunkers.md new file mode 100644 index 0000000..09b1881 --- /dev/null +++ b/ecc/tests/README_chunkers.md @@ -0,0 +1,165 @@ +# Chunker Testing + +This directory contains comprehensive tests for testing different text chunkers used in the GraphRAG ECC (Eventual Consistency Checker) application. + +## Files + +- `test_chunkers.py` - Full test suite with unittest framework +- `test_chunkers_demo.py` - Simple demo script that can be run directly +- `README_chunkers.md` - This file + +## What are Chunkers? + +Chunkers are components that break down large text documents into smaller, manageable pieces (chunks) for processing by AI models. Different chunking strategies are useful for different types of content and use cases. + +## Available Chunkers + +1. **Character Chunker** - Splits text by character count with optional overlap +2. **Regex Chunker** - Splits text using regular expression patterns +3. **Markdown Chunker** - Splits text while preserving markdown structure +4. **Recursive Chunker** - Intelligently splits text using multiple separators +5. **Semantic Chunker** - Splits text based on semantic similarity (requires embedding service) + +## Running the Tests + +### Option 1: Run the Demo Script (Recommended for quick testing) + +```bash +cd graphrag/ecc/tests/app +python test_chunkers_demo.py +``` + +This will run all chunkers with sample text and show you exactly what chunks are produced by each one. 
+ +### Option 2: Run the Full Test Suite + +```bash +cd graphrag/ecc/tests/app +python -m unittest test_chunkers.py -v +``` + +### Option 3: Run Specific Test Methods + +```bash +cd graphrag/ecc/tests/app +python -m unittest test_chunkers.TestChunkers.test_character_chunker -v +python -m unittest test_chunkers.TestChunkers.test_markdown_chunker -v +``` + +## Sample Output + +The tests will show you: + +- **Total number of chunks** produced by each chunker +- **Individual chunk content** with length information +- **Configuration parameters** used (chunk size, overlap, patterns) +- **Performance comparison** between different chunkers +- **Edge case handling** (empty strings, short text, etc.) + +Example output: +``` +============================================================ +1. CHARACTER CHUNKER +============================================================ +Chunk size: 150, Overlap: 15 +Total chunks: 8 +Total characters: 1089 + +--- Chunk 1 (Length: 150) --- +# Introduction to GraphRAG + +GraphRAG is a powerful framework for building Retrieval-Augmented Generation (RAG) systems using graph databases. + +## What is RAG? + +Retrieval-Augmented Generation (RAG) is a technique that combines the power of large language models with external knowledge retrieval. It allows AI systems to access and use information that wasn't part of their training data. + +## Key Components + +1. **Document Ingestion**: Documents are processed and chunked into smaller pieces +2. **Embedding Generation**: Each chunk is converted into a vector representation +3. **Vector Storage**: Embeddings are stored in a vector database for efficient retrieval +4. **Query Processing**: User queries are processed and relevant chunks are retrieved +5. 
**Response Generation**: The LLM generates responses based on retrieved context + +## Benefits + +- Improved accuracy through access to current information +- Reduced hallucination by grounding responses in retrieved facts +- Scalable knowledge management +- Cost-effective compared to fine-tuning + +This framework provides a robust foundation for building enterprise-grade RAG applications. +... +``` + +## Test Coverage + +The test suite covers: + +- **Basic functionality** of each chunker +- **Different configurations** (chunk sizes, overlap sizes, patterns) +- **Edge cases** (empty strings, short text, exact chunk sizes) +- **Performance comparison** between chunkers +- **Integration** with the `get_chunker` utility function +- **Error handling** and validation + +## Customizing Tests + +### Adding New Test Cases + +To add new test cases, edit `test_chunkers.py` and add new test methods: + +```python +def test_my_custom_scenario(self): + """Test a custom scenario""" + # Your test code here + pass +``` + +### Testing with Different Text + +To test with different sample text, modify the `sample_text` variable in the `setUp` method or create new test methods with different text samples. + +### Testing Different Configurations + +Modify the chunker configurations in the test methods to test different parameters: + +```python +chunker = character_chunker.CharacterChunker( + chunk_size=500, # Different chunk size + overlap_size=50 # Different overlap +) +``` + +## Troubleshooting + +### Import Errors + +If you encounter import errors, ensure you're running from the correct directory and that the Python path includes the necessary modules. + +### Mock Errors + +The semantic chunker tests use mocks to avoid actual API calls. If you encounter mock-related errors, check that the mock setup is correct. + +### Configuration Issues + +Some chunkers require specific configuration. Check the chunker-specific test methods for proper configuration examples. 
+ +## Contributing + +When adding new chunkers or modifying existing ones: + +1. Add corresponding tests to `test_chunkers.py` +2. Update the demo script if needed +3. Ensure all tests pass +4. Update this README with new information + +## Dependencies + +The tests require: +- Python 3.7+ +- unittest (built-in) +- mock (built-in in Python 3.3+) +- Access to the GraphRAG common modules + diff --git a/ecc/tests/test_chunkers.py b/ecc/tests/test_chunkers.py new file mode 100644 index 0000000..898f3ac --- /dev/null +++ b/ecc/tests/test_chunkers.py @@ -0,0 +1,357 @@ +import unittest +from unittest.mock import Mock, patch, MagicMock +import sys +import os + +# Add the parent directory to the path to import the modules +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', '..')) + +from app.ecc_util import get_chunker +from common.chunkers import ( + character_chunker, + regex_chunker, + semantic_chunker, + markdown_chunker, + recursive_chunker +) + + +class TestChunkers(unittest.TestCase): + """Test class for testing different chunkers with sample text""" + + def setUp(self): + """Set up test data and mock objects""" + # Sample text for testing different chunkers + self.sample_text = """# Introduction to GraphRAG + +GraphRAG is a powerful framework for building Retrieval-Augmented Generation (RAG) systems using graph databases. + +## What is RAG? + +Retrieval-Augmented Generation (RAG) is a technique that combines the power of large language models with external knowledge retrieval. It allows AI systems to access and use information that wasn't part of their training data. + +## Key Components + +1. **Document Ingestion**: Documents are processed and chunked into smaller pieces +2. **Embedding Generation**: Each chunk is converted into a vector representation +3. **Vector Storage**: Embeddings are stored in a vector database for efficient retrieval +4. **Query Processing**: User queries are processed and relevant chunks are retrieved +5. 
**Response Generation**: The LLM generates responses based on retrieved context + +## Benefits + +- Improved accuracy through access to current information +- Reduced hallucination by grounding responses in retrieved facts +- Scalable knowledge management +- Cost-effective compared to fine-tuning + +This framework provides a robust foundation for building enterprise-grade RAG applications.""" + + # Mock embedding service for semantic chunker + self.mock_embedding_service = Mock() + self.mock_embedding_service.embeddings = Mock() + + # Mock configuration + self.mock_config = { + "chunker": "semantic", + "chunker_config": { + "method": "percentile", + "threshold": 0.95, + "chunk_size": 512, + "overlap_size": 50, + "pattern": "\\r?\\n" + } + } + + def test_character_chunker(self): + """Test character-based chunking""" + print("\n" + "="*60) + print("TESTING CHARACTER CHUNKER") + print("="*60) + + # Create character chunker directly + chunker = character_chunker.CharacterChunker( + chunk_size=200, + overlap_size=20 + ) + + chunks = chunker.chunk(self.sample_text) + + print(f"Character Chunker - Chunk Size: 200, Overlap: 20") + print(f"Total chunks: {len(chunks)}") + print(f"Total characters: {sum(len(chunk) for chunk in chunks)}") + print(f"Original text length: {len(self.sample_text)}") + + for i, chunk in enumerate(chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk[:100] + "..." 
if len(chunk) > 100 else chunk) + + # Assertions + self.assertIsInstance(chunks, list) + self.assertTrue(len(chunks) > 1) + self.assertTrue(all(len(chunk) <= 200 for chunk in chunks)) + + def test_regex_chunker(self): + """Test regex-based chunking""" + print("\n" + "="*60) + print("TESTING REGEX CHUNKER") + print("="*60) + + # Create regex chunker directly + chunker = regex_chunker.RegexChunker(pattern="\\r?\\n") + + chunks = chunker.chunk(self.sample_text) + + print(f"Regex Chunker - Pattern: \\r?\\n") + print(f"Total chunks: {len(chunks)}") + + for i, chunk in enumerate(chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk[:100] + "..." if len(chunk) > 100 else chunk) + + # Assertions + self.assertIsInstance(chunks, list) + self.assertTrue(len(chunks) > 1) + + def test_markdown_chunker(self): + """Test markdown-based chunking""" + print("\n" + "="*60) + print("TESTING MARKDOWN CHUNKER") + print("="*60) + + # Create markdown chunker directly + chunker = markdown_chunker.MarkdownChunker( + chunk_size=300, + chunk_overlap=30 + ) + + chunks = chunker.chunk(self.sample_text) + + print(f"Markdown Chunker - Chunk Size: 300, Overlap: 30") + print(f"Total chunks: {len(chunks)}") + print(f"Total characters: {sum(len(chunk) for chunk in chunks)}") + print(f"Original text length: {len(self.sample_text)}") + + for i, chunk in enumerate(chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk[:100] + "..." 
if len(chunk) > 100 else chunk) + + # Assertions + self.assertIsInstance(chunks, list) + self.assertTrue(len(chunks) > 1) + + def test_recursive_chunker(self): + """Test recursive-based chunking""" + print("\n" + "="*60) + print("TESTING RECURSIVE CHUNKER") + print("="*60) + + # Create recursive chunker directly + chunker = recursive_chunker.RecursiveChunker( + chunk_size=250, + overlap_size=25 + ) + + chunks = chunker.chunk(self.sample_text) + + print(f"Recursive Chunker - Chunk Size: 250, Overlap: 25") + print(f"Total chunks: {len(chunks)}") + print(f"Total characters: {sum(len(chunk) for chunk in chunks)}") + print(f"Original text length: {len(self.sample_text)}") + + for i, chunk in enumerate(chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk[:100] + "..." if len(chunk) > 100 else chunk) + + # Assertions + self.assertIsInstance(chunks, list) + self.assertTrue(len(chunks) > 1) + + @patch('app.ecc_util.graphrag_config') + @patch('app.ecc_util.embedding_service') + def test_semantic_chunker(self, mock_embedding_service, mock_graphrag_config): + """Test semantic chunking through the utility function""" + print("\n" + "="*60) + print("TESTING SEMANTIC CHUNKER") + print("="*60) + + # Mock the configuration + mock_graphrag_config.get.side_effect = lambda key, default=None: { + "chunker": "semantic", + "chunker_config": { + "method": "percentile", + "threshold": 0.95 + } + }.get(key, default) + + # Mock the embedding service + mock_embedding_service.embeddings = Mock() + + # Mock the semantic chunker to avoid actual API calls + with patch('app.ecc_util.semantic_chunker.SemanticChunker') as mock_semantic_class: + mock_chunker_instance = Mock() + mock_chunker_instance.chunk.return_value = [ + "Introduction to GraphRAG", + "What is RAG?", + "Key Components", + "Benefits" + ] + mock_semantic_class.return_value = mock_chunker_instance + + # Get chunker through utility function + chunker = get_chunker("semantic") + chunks = 
chunker.chunk(self.sample_text) + + print(f"Semantic Chunker - Method: percentile, Threshold: 0.95") + print(f"Total chunks: {len(chunks)}") + + for i, chunk in enumerate(chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk) + + # Assertions + self.assertIsInstance(chunks, list) + self.assertTrue(len(chunks) > 0) + + def test_get_chunker_utility_function(self): + """Test the get_chunker utility function with different chunker types""" + print("\n" + "="*60) + print("TESTING GET_CHUNKER UTILITY FUNCTION") + print("="*60) + + # Test different chunker types + chunker_types = ["character", "regex", "markdown", "recursive"] + + for chunker_type in chunker_types: + print(f"\n--- Testing {chunker_type.upper()} chunker ---") + + try: + # Mock the configuration for each chunker type + with patch('app.ecc_util.graphrag_config') as mock_config: + mock_config.get.side_effect = lambda key, default=None: { + "chunker": chunker_type, + "chunker_config": { + "chunk_size": 200, + "overlap_size": 20, + "pattern": "\\r?\\n" + } + }.get(key, default) + + # Mock embedding service for semantic chunker + with patch('app.ecc_util.embedding_service') as mock_emb_service: + mock_emb_service.embeddings = Mock() + + # Get chunker + chunker = get_chunker(chunker_type) + + # Test chunking + chunks = chunker.chunk(self.sample_text) + + print(f"Chunker type: {chunker_type}") + print(f"Total chunks: {len(chunks)}") + print(f"First chunk preview: {chunks[0][:50]}...") + + # Assertions + self.assertIsInstance(chunker, object) + self.assertIsInstance(chunks, list) + self.assertTrue(len(chunks) > 0) + + except Exception as e: + print(f"Error testing {chunker_type} chunker: {e}") + continue + + def test_chunker_edge_cases(self): + """Test chunkers with edge cases""" + print("\n" + "="*60) + print("TESTING CHUNKER EDGE CASES") + print("="*60) + + # Test with empty string + empty_text = "" + print("\n--- Testing with empty string ---") + + chunker = 
character_chunker.CharacterChunker(chunk_size=100) + chunks = chunker.chunk(empty_text) + print(f"Empty string chunks: {chunks}") + self.assertEqual(chunks, []) + + # Test with very short text + short_text = "Hello" + print("\n--- Testing with short text ---") + + chunks = chunker.chunk(short_text) + print(f"Short text chunks: {chunks}") + self.assertEqual(chunks, ["Hello"]) + + # Test with text exactly chunk size + exact_text = "A" * 100 + print("\n--- Testing with text exactly chunk size ---") + + chunks = chunker.chunk(exact_text) + print(f"Exact chunk size chunks: {len(chunks)}") + self.assertEqual(len(chunks), 1) + self.assertEqual(len(chunks[0]), 100) + + def test_chunker_performance_comparison(self): + """Compare performance and output characteristics of different chunkers""" + print("\n" + "="*60) + print("CHUNKER PERFORMANCE COMPARISON") + print("="*60) + + chunker_configs = [ + ("character", {"chunk_size": 200, "overlap_size": 20}), + ("markdown", {"chunk_size": 200, "chunk_overlap": 20}), + ("recursive", {"chunk_size": 200, "overlap_size": 20}) + ] + + results = {} + + for chunker_name, config in chunker_configs: + print(f"\n--- {chunker_name.upper()} Chunker ---") + + if chunker_name == "character": + chunker = character_chunker.CharacterChunker(**config) + elif chunker_name == "markdown": + chunker = markdown_chunker.MarkdownChunker(**config) + elif chunker_name == "recursive": + chunker = recursive_chunker.RecursiveChunker(**config) + + chunks = chunker.chunk(self.sample_text) + + # Calculate statistics + chunk_lengths = [len(chunk) for chunk in chunks] + avg_length = sum(chunk_lengths) / len(chunk_lengths) if chunk_lengths else 0 + min_length = min(chunk_lengths) if chunk_lengths else 0 + max_length = max(chunk_lengths) if chunk_lengths else 0 + + results[chunker_name] = { + "total_chunks": len(chunks), + "avg_chunk_length": avg_length, + "min_chunk_length": min_length, + "max_chunk_length": max_length, + "total_characters": sum(chunk_lengths) + } + 
+ print(f"Total chunks: {len(chunks)}") + print(f"Average chunk length: {avg_length:.1f}") + print(f"Min chunk length: {min_length}") + print(f"Max chunk length: {max_length}") + print(f"Total characters: {sum(chunk_lengths)}") + + # Print summary comparison + print("\n" + "="*60) + print("SUMMARY COMPARISON") + print("="*60) + + for chunker_name, stats in results.items(): + print(f"\n{chunker_name.upper()}:") + print(f" Chunks: {stats['total_chunks']}") + print(f" Avg Length: {stats['avg_chunk_length']:.1f}") + print(f" Length Range: {stats['min_chunk_length']}-{stats['max_chunk_length']}") + print(f" Total Chars: {stats['total_characters']}") + + +if __name__ == "__main__": + # Run the tests with verbose output + unittest.main(verbosity=2) + diff --git a/ecc/tests/test_chunkers_demo.py b/ecc/tests/test_chunkers_demo.py new file mode 100644 index 0000000..325c19f --- /dev/null +++ b/ecc/tests/test_chunkers_demo.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +""" +Demo script to test different chunkers with sample text. +This script can be run directly to see how different chunkers work. +""" + +import sys +import os + +# Add the parent directory to the path to import the modules +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', '..')) + +from common.chunkers import ( + character_chunker, + regex_chunker, + semantic_chunker, + markdown_chunker, + recursive_chunker +) + + +def test_chunkers(): + """Test different chunkers with sample text and print results""" + + # Sample text for testing + sample_text = """# Introduction to GraphRAG + +GraphRAG is a powerful framework for building Retrieval-Augmented Generation (RAG) systems using graph databases. + +## What is RAG? + +Retrieval-Augmented Generation (RAG) is a technique that combines the power of large language models with external knowledge retrieval. It allows AI systems to access and use information that wasn't part of their training data. + +## Key Components + +1. 
**Document Ingestion**: Documents are processed and chunked into smaller pieces +2. **Embedding Generation**: Each chunk is converted into a vector representation +3. **Vector Storage**: Embeddings are stored in a vector database for efficient retrieval +4. **Query Processing**: User queries are processed and relevant chunks are retrieved +5. **Response Generation**: The LLM generates responses based on retrieved context + +## Benefits + +- Improved accuracy through access to current information +- Reduced hallucination by grounding responses in retrieved facts +- Scalable knowledge management +- Cost-effective compared to fine-tuning + +This framework provides a robust foundation for building enterprise-grade RAG applications.""" + + print("=" * 80) + print("CHUNKER TESTING DEMO") + print("=" * 80) + print(f"Sample text length: {len(sample_text)} characters") + print("=" * 80) + + # Test 1: Character Chunker + print("\n" + "=" * 60) + print("1. CHARACTER CHUNKER") + print("=" * 60) + + char_chunker = character_chunker.CharacterChunker( + chunk_size=150, + overlap_size=15 + ) + + char_chunks = char_chunker.chunk(sample_text) + print(f"Chunk size: 150, Overlap: 15") + print(f"Total chunks: {len(char_chunks)}") + print(f"Total characters: {sum(len(chunk) for chunk in char_chunks)}") + + for i, chunk in enumerate(char_chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk) + if len(chunk) > 100: + print("...") + + # Test 2: Regex Chunker + print("\n" + "=" * 60) + print("2. 
REGEX CHUNKER") + print("=" * 60) + + regex_chunker_instance = regex_chunker.RegexChunker(pattern="\\r?\\n") + regex_chunks = regex_chunker_instance.chunk(sample_text) + + print(f"Pattern: \\r?\\n (split on newlines)") + print(f"Total chunks: {len(regex_chunks)}") + + for i, chunk in enumerate(regex_chunks): + if chunk.strip(): # Only show non-empty chunks + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk.strip()) + if len(chunk) > 100: + print("...") + + # Test 3: Markdown Chunker + print("\n" + "=" * 60) + print("3. MARKDOWN CHUNKER") + print("=" * 60) + + md_chunker = markdown_chunker.MarkdownChunker( + chunk_size=200, + chunk_overlap=20 + ) + + md_chunks = md_chunker.chunk(sample_text) + print(f"Chunk size: 200, Overlap: 20") + print(f"Total chunks: {len(md_chunks)}") + print(f"Total characters: {sum(len(chunk) for chunk in md_chunks)}") + + for i, chunk in enumerate(md_chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk) + if len(chunk) > 100: + print("...") + + # Test 4: Recursive Chunker + print("\n" + "=" * 60) + print("4. RECURSIVE CHUNKER") + print("=" * 60) + + rec_chunker = recursive_chunker.RecursiveChunker( + chunk_size=180, + overlap_size=18 + ) + + rec_chunks = rec_chunker.chunk(sample_text) + print(f"Chunk size: 180, Overlap: 18") + print(f"Total chunks: {len(rec_chunks)}") + print(f"Total characters: {sum(len(chunk) for chunk in rec_chunks)}") + + for i, chunk in enumerate(rec_chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk) + if len(chunk) > 100: + print("...") + + # Test 5: Different configurations comparison + print("\n" + "=" * 60) + print("5. 
CONFIGURATION COMPARISON") + print("=" * 60) + + configs = [ + {"chunk_size": 100, "overlap_size": 10}, + {"chunk_size": 200, "overlap_size": 20}, + {"chunk_size": 300, "overlap_size": 30} + ] + + for config in configs: + print(f"\n--- Character Chunker: {config} ---") + chunker = character_chunker.CharacterChunker(**config) + chunks = chunker.chunk(sample_text) + + chunk_lengths = [len(chunk) for chunk in chunks] + avg_length = sum(chunk_lengths) / len(chunk_lengths) if chunk_lengths else 0 + + print(f" Total chunks: {len(chunks)}") + print(f" Average chunk length: {avg_length:.1f}") + print(f" Min chunk length: {min(chunk_lengths) if chunk_lengths else 0}") + print(f" Max chunk length: {max(chunk_lengths) if chunk_lengths else 0}") + + # Test 6: Edge cases + print("\n" + "=" * 60) + print("6. EDGE CASES") + print("=" * 60) + + # Empty string + empty_chunks = char_chunker.chunk("") + print(f"Empty string: {empty_chunks}") + + # Very short text + short_chunks = char_chunker.chunk("Hello") + print(f"Short text 'Hello': {short_chunks}") + + # Text exactly chunk size + exact_text = "A" * 150 + exact_chunks = char_chunker.chunk(exact_text) + print(f"Text exactly 150 chars: {len(exact_chunks)} chunks") + + # Summary + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + print(f"Character chunks: {len(char_chunks)}") + print(f"Regex chunks: {len(regex_chunks)}") + print(f"Markdown chunks: {len(md_chunks)}") + print(f"Recursive chunks: {len(rec_chunks)}") + print("=" * 80) + + +if __name__ == "__main__": + try: + test_chunkers() + except Exception as e: + print(f"Error running chunker tests: {e}") + import traceback + traceback.print_exc() + diff --git a/ecc/tests/test_chunkers_direct.py b/ecc/tests/test_chunkers_direct.py new file mode 100644 index 0000000..290c169 --- /dev/null +++ b/ecc/tests/test_chunkers_direct.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python3 +""" +Direct test script for testing different chunkers with sample text. 
+This version imports chunkers directly to avoid dependency issues. +""" + +import sys +import os +from common.chunkers import CharacterChunker, RegexChunker, MarkdownChunker, RecursiveChunker, SemanticChunker + +sample_text = """![Title: BARCLAYS\nThe image shows the Barclays logo, which is a blue eagle symbol with the text "BARCLAYS" below it.](s3://barclays-output/bda/output/BarclaysDocs/CoreRangeCustomerRateSheet.pdf/d4bd1574-c073-410f-82f9-8ce02e41622e/0/standard_output/0/assets/8c0315a8-8df5-40e7-955d-7315890649f1.png)\n\nBARCLAYS\n\n# Rates Effective From 30 April 2025\n\nThis Rate Sheet is for information only and does not constitute a Personalised Mortgage Information Sheet. Rates are subject to change and withdrawal at short notice.\n\nCustomers remortgaging may be able to use the Switch & Save and Great Escape remortgage package. As part of the remortgage package, we offer a legal service via a panel of solicitors who act on our behalf only. The legal service is not available should a customer choose not to use the appointed panel solicitor. Customers will not pay for or receive a copy of the non-disclosed valuation report. Switch & Save\u1d40M products are generally only available to customers who have a mortgage with another lender (not Barclays or Woolwich), who want to move their mortgage to us and are not moving home. In the case of a remortgage where it includes, for example, a change of names on the mortgage or the first registration of unregistered land, customers are liable for any associated fees.\n\nFor Residential purchase applications with a property value up to \u00a32 million and all Remortgage applications, customers will not pay for or receive a copy of any non-disclosed valuation report. 
For Residential purchase applications with a property value over \u00a32 million please see the Tariff of Charges at In order to confirm the most up to date rates available, please speak to our mortgage advisers for full details.\n\nThe Bank of England Base Rate (BEBR) is a variable rate set by the Bank of England. **BEBR is currently** [ **4.50%.** ](4.50%.)\n\nRepresentative example:\n\nA capital and interest mortgage of \u00a3198,000 payable over 300 months on a Fixed rate of for 2 years and then our variable tracker rate of above the [ 4.27% ](4.27%) [ 1.99% ](1.99%) Bank of England Base Rate (currently 4.50%), for the remaining term would require 24 monthly payments of \u00a31074.86 and 276 monthly payments of \u00a31318.26. The total amount payable would be \u00a3389,741.40 made up of the loan amount plus interest and \u00a30 (product fee), \u00a380 (final repayment charge), \u00a325 (completion fee). The overall cost for comparison is APRC representative. [ 6.2% ](6.2%)\n\nDuring an early repayment charge period, capital reductions within a set allowance can be made without incurring the charge. **The allowance for Premier exclusive fixed rates is 25% per annum (only applies for new mortgage applications from April 19th 2024 onwards), all other fixed rates are 10% per annum. The allowance for tracker rates is 25% per annum. 
For Offset mortgages early repayment charges are only incurred if the mortgage is fully redeemed within the charge period.**\n\n**Residential Purchase Rates**\n\n**Offset Products**\n\n[ 1.22% ](1.22%)\n\n[ 5.72% ](5.72%)\n\n[ 6.6% ](6.6%)\n\n[ 4.50% ](4.50%) [ 1.99% ](1.99%)\n\n[ 6.49% ](6.49%)\n\n[ 5.75% ](5.75%)\n\n[ 1.25% ](1.25%)\n\n[ 6.5% ](6.5%)\n\n| Product Type | Minimum/Maximum Loan size | Initial Interest Rate | Follow on rate | The Overall Cost for Comparison | Product Fee | Maximum Loan to Value (LTV) | Early Repayment Charge |\n|---------------------------------------------|-----------------------------------|-------------------------|------------------------------------------------------------------|-----------------------------------|---------------|-------------------------------|---------------------------------------------------------|\n| 2 Year Offset Tracker at BEBR + for 2 Years | Minimum Loan \u00a35k Maximum Loan \u00a32m | | Offset Tracker at BEBR which is variable, currently + = for term | APRC | \u00a31749 | 75% | Full redemption: 1% of the original balance for 2 years |\n| 5 Year Offset Tracker at BEBR + for 5 Years | | | | APRC | \u00a31749 | 75% | Full redemption: 1% of the original balance for 5 years |\n\nOffset: Current Account and Savings Accounts can be offset against the mortgage, see Offset Terms & Conditions for more information. 
Please note customers can only hold one Offset Mortgage at a time\n\n**Tracker Products**\n\n**Premier Exclusive Rates -** **Customers must hold a** **Wealth /** **Premier Banking relationship with Barclays to apply for these products and qualify for retail underwriting**\n\n[ 0.21% ](0.21%)\n\n[ 4.71% ](4.71%)\n\n[ 6.4% ](6.4%)\n\n[ 0.49% ](0.49%)\n\n[ 6.4% ](6.4%)\n\n[ 4.99% ](4.99%)\n\n[ 6.4% ](6.4%)\n\n[ 0.36% ](0.36%)\n\n[ 4.86% ](4.86%)\n\n[ 5.26% ](5.26%)\n\n[ 0.76% ](0.76%)\n\n[ 6.5% ](6.5%)\n\n[ 1.10% ](1.10%)\n\n[ 5.60% ](5.60%)\n\n[ 6.6% ](6.6%)\n\n[ 0.50% ](0.50%)\n\n[ 4.50% ](4.50%)\n\n[ 5.00% ](5.00%)\n\n[ 6.4% ](6.4%)\n\n[ 1.99% ](1.99%) [ 6.49% ](6.49%)\n\n[ 0.35% ](0.35%)\n\n[ 4.85% ](4.85%)\n\n[ 6.4% ](6.4%)\n\n[ 0.55% ](0.55%)\n\n[ 5.05% ](5.05%)\n\n[ 6.4% ](6.4%)\n\n[ 0.57% ](0.57%)\n\n[ 5.07% ](5.07%)\n\n[ 6.4% ](6.4%)\n\n[ 0.60% ](0.60%)\n\n[ 6.1% ](6.1%)\n\n[ 5.10% ](5.10%)\n\n[ 1.00% ](1.00%)\n\n[ 6.3% ](6.3%)\n\n[ 5.50% ](5.50%)\n\n| Product Type | Minimum/Maximum Loan size | Initial Interest Rate | Follow on rate | The Overall Cost for Comparison | Product Fee | LTV | Early Repayment Charge |\n|--------------------------------------------------------|-------------------------------------|-------------------------|------------------------------------------------|-----------------------------------|---------------|-------|--------------------------------------|\n| 2 Year Tracker at BEBR + for 2 Years | Minimum Loan \u00a35k Maximum Loan \u00a32m | | BEBR which is variable, currently + = for term | APRC | \u00a3999 | 60% | No ERC |\n| 2 Year Tracker at BEBR + for 2 Years | | | | APRC | \u00a30 | 60% | 1% of the balance repaid for 2 years |\n| 2 Year Tracker at BEBR + for 2 Years | | | | APRC | \u00a3999 | 75% | No ERC |\n| 2 Year Tracker at BEBR + for 2 Years | | | | APRC | \u00a3999 | 85% | |\n| 2 Year Tracker at BEBR + for 2 Years | Minimum Loan \u00a35k Maximum Loan \u00a3640k | | | APRC | \u00a3999 | 90% | |\n| Premier Exclusive 2 Year 
Tracker at BEBR + for 2 Years | Minimum Loan \u00a35k Maximum Loan \u00a32m | | | APRC | \u00a30 | 75% | 1% of the balance repaid for 2 years |\n| 2 Year Tracker at BEBR + for 2 Years | Minimum Loan \u00a32m Maximum Loan \u00a310m | | | APRC | \u00a31999 | 60% | No ERC |\n| 2 Year Tracker at BEBR + for 2 Years | | | | APRC | \u00a31999 | 70% | |\n| 2 Year Tracker at BEBR + for 2 Years | Minimum Loan \u00a32m Maximum Loan \u00a35m | | | APRC | \u00a31999 | 75% | |\n| 5 Year Tracker at BEBR + for 5 Years | Minimum Loan \u00a35k Maximum Loan \u00a32m | | | APRC | \u00a3999 | 60% | |\n| 5 Year Tracker at BEBR + for 5 Years | | | | APRC | \u00a3999 | 85% | |\n\n**YOUR HOME MAY BE REPOSSESSED IF YOU DO NOT KEEP UP REPAYMENTS ON YOUR MORTGAGE**\n\n**Barclays Bank UK PLC. Authorised by the Prudential Regulation Authority and regulated by the Financial Conduct Authority and the Prudential Regulation Authority (Financial Services Register No. 759676). Registered in England. Registered No. 9740322 Registered Office: 1** **Churchill Place, London E14 5HP.**\n\n**Page 1** **of 7** ![Title: BARCLAYS\nThe image shows the Barclays logo, which is a blue eagle symbol with the text "BARCLAYS" underneath.](s3://barclays-output/bda/output/BarclaysDocs/CoreRangeCustomerRateSheet.pdf/d4bd1574-c073-410f-82f9-8ce02e41622e/0/standard_output/0/assets/9033f6aa-c6c3-4f01-8691-1bea87591145.png)\n\nBARCLAYS\n\n# Rates Effective From 30 April 2025\n\n## Residential Purchase Rates\n\n## 2 Year Fixed Products\n\n**Premier Exclusive Rates -** **Customers must hold a** **Wealth /** **Premier Banking relationship with Barclays to apply for these products and qualify for retail underwriting**\n\n| Product Type | Minimum/Maximum Loan size | Initial Interest Rate | Follow on rate | The Overall Cost for Comparison | Product Fee | LTV | Early Repayment Charge 
|\n|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------|-------------------------|------------------------------------------------------------------|-----------------------------------|---------------|-------|-------------------------------------------|\n| Premier Exclusive 2 Year Fixed Rate (until 30/06/2027) | Minimum Loan \u00a35k Maximum Loan \u00a32m | 3.91% | BEBR which is variable, currently 4.50% + 1.99% = 6.49% for term | 6.2% APRC | \u00a3899 | 60% | 2% of the balance repaid until 30/06/2027 |\n| 2 Year Fixed Rate (until 30/06/2027) | | 3.92% | | 6.2% APRC | \u00a3899 | 60% | |\n| | | 4.18% | | 6.2% APRC | \u00a30 | 60% | |\n| Premier Exclusive 2 Year Fixed Rate (until 30/06/2027) | | 3.98% | | 6.2% APRC | \u00a3899 | 75% | |\n| 2 Year Fixed Rate (until 30/06/2027) | | 3.99% | | 6.2% APRC | \u00a3899 | 75% | |\n| | | 4.27% | | 6.2% APRC | \u00a30 | 75% | |\n| | | 4.25% | | 6.3% APRC | \u00a3899 | 85% | |\n| | | 4.45% | | 6.3% APRC | \u00a30 | 85% | |\n| Premier Exclusive 2 Year Fixed Rate (until 30/06/2027) | Minimum Loan \u00a35k Maximum Loan \u00a3640k | 4.64% | | 6.4% APRC | \u00a3899 | 90% | |\n| 2 Year Fixed Rate (until 30/06/2027) | | 4.65% | | 6.4% APRC | \u00a3899 | 90% | |\n| | | 4.84% | | 6.4% APRC | \u00a30 | 90% | |\n| | Minimum Loan \u00a32m Maximum Loan \u00a310m | 4.21% | | 6.2% APRC | \u00a31999 | 60% | |\n| | | 4.36% | | 6.3% APRC | \u00a31999 | 70% | |\n| | Minimum Loan \u00a32m Maximum Loan \u00a35m | 4.52% | | 6.3% APRC | \u00a31999 | 75% | |\n| | | 4.67% | | 6.3% APRC | \u00a31999 | 85% | |\n| 3 Year Fixed Products Premier Exclusive Rates - Customers must hold a Wealth / Premier Banking relationship with Barclays to apply for these products and qualify for retail underwriting | | | | | | | |\n| Premier Exclusive 3 Year Fixed Rate (until 30/06/2028) | Minimum Loan 
\u00a35k Maximum Loan \u00a32m | 3.99% | BEBR which is variable, currently 4.50% + 1.99% = 6.49% for term | 6.0% APRC | \u00a3899 | 60% | 3% of the balance repaid until 30/06/2028 |\n| 5 Year Fixed Products Premier Exclusive Rates - Customers must hold a Wealth / Premier Banking relationship with Barclays to apply for these products and qualify for retail underwriting | | | | | | | |\n| Premier Exclusive 5 Year Fixed Rate (until 30/06/2030) | Minimum Loan \u00a35k Maximum Loan \u00a32m | 3.92% | BEBR which is variable, currently 4.50% + 1.99% = 6.49% for term | 5.6% APRC | \u00a3899 | 60% | 4% of the balance repaid until 30/06/2030 |\n| 5 Year Fixed Rate (until 30/06/2030) | | 3.93% | | 5.6% APRC | \u00a3899 | 60% | |\n| | | 4.15% | | 5.6% APRC | \u00a30 | 60% | |\n| 5 Year Fixed Rate (until 30/06/2030) | | 4.14% | | 5.7% APRC | \u00a3899 | 75% | |\n| | | 4.27% | | 5.7% APRC | \u00a30 | 75% | |\n| | | 4.28% | | 5.8% APRC | \u00a3899 | 85% | |\n| | | 4.37% | | 5.7% APRC | \u00a30 | 85% | |\n| Premier Exclusive 5 Year Fixed Rate (until 30/06/2030) | Minimum Loan \u00a35k Maximum Loan \u00a3640k | 4.62% | | 5.9% APRC | \u00a3899 | 90% | |\n| 5 Year Fixed Rate (until 30/06/2030) | | 4.67% | | 5.9% APRC | \u00a3899 | 90% | |\n| 5 Year Fixed Rate (until 30/06/2030) | Minimum Loan \u00a32m Maximum Loan \u00a310m | 4.28% | | 5.7% APRC | \u00a31999 | 60% | |\n| | | 4.37% | | 5.8% APRC | \u00a31999 | 70% | |\n| | Minimum Loan \u00a32m Maximum Loan \u00a35m | 4.47% | | 5.8% APRC | \u00a31999 | 75% | |\n| | | 4.63% | | 5.9% APRC | \u00a31999 | 85% | |\n| 10 Year Fixed Product | | | | | | | |\n| 10 Year Fixed Rate (until 30/06/2035) | Minimum Loan \u00a35k Maximum Loan \u00a31m | 4.95% | BEBR which is variable, currently 4.50% + 1.99% 6.49% for term | 5.6% APRC = | \u00a3999 | 60% | 6% of the balance repaid until 30/06/2035 |\n| | | 5.51% | | 6.0% APRC | \u00a3999 | 80% | |\n\n**YOUR HOME MAY BE REPOSSESSED IF YOU DO NOT KEEP UP REPAYMENTS ON YOUR MORTGAGE**\n\n**Barclays Bank 
UK PLC. Authorised by the Prudential Regulation Authority and regulated by the Financial Conduct Authority and the Prudential Regulation Authority (Financial Services Register No. 759676). Registered in England. Registered No. 9740322 Registered Office: 1** **Churchill Place, London E14 5HP.**\n\n**Page 2** **of 7** ![Title: BARCLAYS\nThe image shows the Barclays logo, which is a blue eagle symbol with the text "BARCLAYS" underneath.](s3://barclays-output/bda/output/BarclaysDocs/CoreRangeCustomerRateSheet.pdf/d4bd1574-c073-410f-82f9-8ce02e41622e/0/standard_output/0/assets/e2ca4484-4353-4d3f-af7e-76b577bb0bf2.png)\n\nBARCLAYS\n\n# Rates Effective From 30 April 2025\n\n# Residential Purchase Rates\n\n## Barclays Green Home Mortgages\n\nThese products can only be used to purchase a new build residential property with an Energy Efficiency Rating of 81 or higher, or an Energy Efficiency Band of A or B.\n\n| Product Type | Minimum/Maximum Loan size | Initial Interest Rate | Follow on rate | The Overall Cost for Comparison | Product Fee | LTV | Early Repayment Charge |\n|--------------------------------------|-------------------------------------|-------------------------|------------------------------------------------------------------|-----------------------------------|---------------|-------|-------------------------------------------|\n| 2 Year Fixed Rate (until 30/06/2027) | Minimum Loan \u00a35k Maximum Loan \u00a32m | 4.08% | BEBR which is variable, currently 4.50% + 1.99% = 6.49% for term | 6.2% APRC | \u00a30 | 60% | 2% of the balance repaid until 30/06/2027 |\n| | | 4.17% | | 6.2% APRC | \u00a30 | 75% | |\n| | | 4.15% | | 6.3% APRC | \u00a3899 | 85% | |\n| | Minimum Loan \u00a35k Maximum Loan \u00a3640k | 4.74% | | 6.3% APRC | \u00a30 | 90% | |\n| 5 Year Fixed Rate (until 30/06/2030) | Minimum Loan \u00a35k Maximum Loan \u00a32m | 3.83% | | 5.6% APRC | \u00a3899 | 60% | 4% of the balance repaid until 30/06/2030 |\n| | | 4.04% | | 5.6% APRC | \u00a3899 | 75% | 
|\n| | | 4.18% | | 5.7% APRC | \u00a3899 | 85% | |\n| | Minimum Loan \u00a35k Maximum Loan \u00a3640k | 4.57% | | 5.9% APRC | \u00a3899 | 90% | |\n\n**Satisfactory evidence that the property has an Energy Efficiency Rating of 81 or higher, or has an Energy Efficiency Band of A** **or B, must be provided before the advance is made and shall be either: (a) a** **valid Energy Performance Certificate completed less than 10 years prior to the submission of your mortgage application; or (b) a** **valid Predicted Energy Performance Certificate if the property build phase has not been completed**\n\n## Family Springboard Mortgages\n\nThese products can only be used to purchase a residential property with a mortgage loan size that is over 90% of the property price or value up to a maximum of 100% whichever is the lower and cannot be combined with any other product. Loans outside of this Loan to Value (LTV) will not be allowed.\n\n5 Year Fixed Rate (until 30/06/2030)\n\nMinimum Loan \u00a35k Maximum Loan \u00a3500K\n\n4.97% 5.29%\n\nBEBR which is variable, currently 4.50% + 1.99% = 6.49% for term\n\n6.0% APRC 6.1% APRC\n\n\u00a30 \u00a30\n\n95% 100%\n\n4% of the balance repaid until 30/06/2030\n\n**A Helpful Start Account through Barclays Bank UK PLC must be taken out as a** **condition of this mortgage. Prior to completion of the mortgage advance, the Helpful Start Account must receive a** **deposit, equivalent to 10% of the purchase price of the property. The Helpful Start Account must remain open for a** **minimum period of five years from the date of the completion (subject to mortgage payments being maintained) or until full redemption of the mortgage, whichever is sooner.**\n\n## Mortgage Guarantee Scheme\n\nThese products can only be used to purchase a residential property (excluding new-build) with a mortgage loan size that is over 90% of the property price or value up to a maximum of 95% whichever is the lower and cannot be combined with any other product. 
Loans outside of this Loan to Value (LTV) will not be allowed.\n\n2% of the balance\n\n2 Year Fixed Rate (until 30/06/2027)\n\n95%\n\n\u00a30\n\n4.90%\n\n6.4% APRC\n\nrepaid until\n\nBEBR which is variable,\n\nMinimum Loan \u00a325k\n\n30/06/2027\n\ncurrently 4.50% + 1.99%\n\nMaximum Loan \u00a3570K\n\n4% of the balance\n\n= 6.49% for term\n\n5 Year Fixed Rate (until 30/06/2030)\n\nrepaid until\n\n95%\n\n5.9% APRC\n\n4.84%\n\n\u00a30\n\n30/06/2030\n\n**Applications under the scheme must be for residential properties in the UK (excluding new-build properties) with a** **value of \u00a3600,000 or less. The property must be the applicant's only property at time of completion and the whole of the loan must be on this product and cannot be used as part of a** **porting top up.**\n\n**YOUR HOME MAY BE REPOSSESSED IF YOU DO NOT KEEP UP REPAYMENTS ON YOUR MORTGAGE Barclays Bank UK PLC. Authorised by the Prudential Regulation Authority and regulated by the Financial Conduct Authority and the Prudential Regulation Authority (Financial Services Register No. 759676). Registered in England. Registered No. 
9740322 Registered Office: 1** **Churchill Place, London E14 5HP.**\n\n**Page 3** **of 7** ![Title: BARCLAYS\nThe image shows the Barclays logo, which is a blue eagle symbol with the text "BARCLAYS" below it.](s3://barclays-output/bda/output/BarclaysDocs/CoreRangeCustomerRateSheet.pdf/d4bd1574-c073-410f-82f9-8ce02e41622e/0/standard_output/0/assets/ef6756c3-75d1-4651-b446-2002473e9eeb.png)\n\nBARCLAYS\n\n# Rates Effective From 30 April 2025\n\n# Residential Remortgage Rates\n\nSwitch & Save available for loans up to \u00a32m\n\n## Offset Products\n\n| Product Type | Minimum/Maximum Loan size | Initial Interest Rate | Follow on rate | The Overall Cost for Comparison | Product Fee | Maximu m Loan to Value (LTV) | Early Repayment Charge |\n|---------------------------------------------------|-----------------------------------|-------------------------|------------------------------------------------------------------------------------|-----------------------------------|---------------|--------------------------------|---------------------------------------------------------|\n| 2 Year Offset Tracker at BEBR + 1.22% for 2 Years | Minimum Loan \u00a35k Maximum Loan \u00a32m | 5.72% | Offset Tracker at BEBR which is variable, currently 4.50% + 1.99% = 6.49% for term | 6.6% APRC | \u00a31749 | 75% | Full redemption: 1% of the original balance for 2 years |\n| 5 Year Offset Tracker at BEBR + 1.25% for 5 Years | | 5.75% | | 6.5% APRC | \u00a31749 | 75% | Full redemption: 1% of the original balance for 5 years |\n\n**Offset:** Current Account and Savings Accounts can be offset against the mortgage, see Offset Terms & Conditions for more information. 
Please note customers can only hold one Offset Mortgage at a time.\n\n## Tracker Products\n\n**Premier Exclusive Rates -** **Customers must hold a** **Wealth /** **Premier Banking relationship with Barclays to apply for these products and qualify for retail underwriting**\n\n| | | | | | | | |\n|--------------------------------------------------------------|------------------------------------|-------|------------------------------------------------------------------|-----------|-------|-----|--------------------------------------|\n| 2 Year Tracker at BEBR + 0.21% for 2 Years | Minimum Loan \u00a35k Maximum Loan \u00a32m | 4.71% | BEBR which is variable, currently 4.50% + 1.99% = 6.49% for term | 6.4% APRC | \u00a3999 | 60% | No ERC |\n| 2 Year Tracker at BEBR + 0.36% for 2 Years | | 4.86% | | 6.4% APRC | \u00a3999 | 75% | |\n| 2 Year Tracker at BEBR + 0.76% for 2 Years | | 5.26% | | 6.5% APRC | \u00a3999 | 85% | |\n| Premier Exclusive 2 Year Tracker at BEBR + 0.50% for 2 Years | | 5.00% | | 6.4% APRC | \u00a30 | 75% | 1% of the balance repaid for 2 years |\n| 2 Year Tracker at BEBR + 0.35% for 2 Years | Minimum Loan \u00a32m Maximum Loan \u00a310m | 4.85% | | 6.4% APRC | \u00a31999 | 60% | No ERC |\n| 2 Year Tracker at BEBR + 0.55% for 2 Years | | 5.05% | | 6.4% APRC | \u00a31999 | 70% | |\n| 2 Year Tracker at BEBR + 0.57% for 2 Years | Minimum Loan \u00a32m Maximum Loan \u00a35m | 5.07% | | 6.4% APRC | \u00a31999 | 75% | |\n| 5 Year Tracker at BEBR + 0.60% for 5 Years | Minimum Loan \u00a35k Maximum Loan \u00a32m | 5.10% | | 6.1% APRC | \u00a3999 | 60% | |\n| 5 Year Tracker at BEBR + 1.00% for 5 Years | | 5.50% | | 6.3% APRC | \u00a3999 | 85% | |\n\n## 2 Year Fixed Products\n\n**Premier Exclusive Rates -** **Customers must hold a** **Wealth /** **Premier Banking relationship with Barclays to apply for these products and qualify for retail underwriting**\n\nPremier Exclusive 2 Year Fixed Rate (until 30/06/2027)\n\n60%\n\n3.95%\n\n6.2% 
APRC\n\n\u00a3999\n\n60%\n\n3.96%\n\n6.2% APRC\n\n\u00a3999\n\nMinimum Loan \u00a35k\n\nMaximum Loan \u00a32m\n\nBEBR which is\n\n4.19%\n\n6.3% APRC\n\n\u00a3999\n\n75%\n\n2% of the balance\n\nvariable,\n\ncurrently 4.50%\n\nrepaid\n\n5.09%\n\n\u00a3999\n\n6.5% APRC\n\n85%\n\nuntil 30/06/2027\n\n+ 1.99% =\n\n2 Year Fixed Rate (until 30/06/2027)\n\n6.49% for term\n\n\u00a31999\n\n60%\n\n4.21%\n\n6.2% APRC\n\nMinimum Loan \u00a32m\n\nMaximum Loan \u00a310m\n\n4.36%\n\n6.3% APRC\n\n70%\n\n\u00a31999\n\nMinimum Loan \u00a32m\n\n4.52%\n\n\u00a31999\n\n6.3% APRC\n\n75%\n\nMaximum Loan \u00a35m\n\n## 2 Year Fixed Products\n\n**Great Escape:** No Product Fee, Free Legals, Free non-disclosed Valuation & \u00a3150 cashback.\n\n(Great Escape: Customers use Barclays nominated Solicitor. Barclays will pay for specified remortgage fees)\n\n**Own Solicitor:** Customers provide and use their own Solicitor and receive a Free non-disclosed Valuation & \u00a3500 cashback\n\nBEBR which is\n\n4.37%\n\n60%\n\n\u00a30\n\n6.3% APRC\n\n2% of the balance\n\nvariable,\n\nMinimum Loan \u00a350k\n\n2 Year Fixed Rate (until 30/06/2027)\n\ncurrently 4.50%\n\nrepaid\n\nMaximum Loan \u00a32m\n\nuntil 30/06/2027\n\n+ 1.99% =\n\n75%\n\n6.3% APRC\n\n4.62%\n\n\u00a30\n\n6.49% for term\n\n**YOUR HOME MAY BE REPOSSESSED IF YOU DO NOT KEEP UP REPAYMENTS ON YOUR MORTGAGE Barclays Bank UK PLC. Authorised by the Prudential Regulation Authority and regulated by the Financial Conduct Authority and the Prudential Regulation Authority (Financial Services Register No. 759676). Registered in England. Registered No. 
9740322 Registered Office: 1** **Churchill Place, London E14 5HP.**\n\n**Page 4** **of 7** ![Title: BARCLAYS\nThe image shows the Barclays logo, which is a blue eagle symbol with the text "BARCLAYS" below it.](s3://barclays-output/bda/output/BarclaysDocs/CoreRangeCustomerRateSheet.pdf/d4bd1574-c073-410f-82f9-8ce02e41622e/0/standard_output/0/assets/47afb445-beea-48aa-baa3-ed4e11dbbac0.png)\n\nBARCLAYS\n\n# Rates Effective From 30 April 2025\n\n# Residential Remortgage Rates\n\nSwitch & Save available for loans up to \u00a32m\n\n## 5 Year Fixed Products\n\n**Premier Exclusive Rates -** **Customers must hold a** **Wealth /** **Premier Banking relationship with Barclays to apply for these products and qualify for retail underwriting**\n\n| Product Type | Minimum/Maxim um Loan size | Initial Interest Rate | Follow on rate | The Overall Cost for Comparison | Product Fee | LTV | Early Repayment Charge |\n|--------------------------------------------------------|------------------------------------|-------------------------|----------------------------------------------------------------|-----------------------------------|---------------|-------|-------------------------------------------|\n| Premier Exclusive 5 Year Fixed Rate (until 30/06/2030) | Minimum Loan \u00a35k Maximum Loan \u00a32m | 3.95% | BEBR which is variable, currently 4.50% 1.99% = 6.49% for term | 5.6% APRC | \u00a3999 | 60% | 4% of the balance repaid until 30/06/2030 |\n| 5 Year Fixed Rate (until 30/06/2030) | | 3.96% | | 5.6% APRC | \u00a3999 | 60% | |\n| | | 4.09% | | 5.7% APRC | \u00a3999 | 75% | |\n| | | 4.97% | | 6.1% APRC | \u00a3999 | 85% | |\n| | Minimum Loan \u00a32m Maximum Loan \u00a310m | 4.28% | | 5.7% APRC | \u00a31999 | 60% | |\n| | | 4.37% | | 5.8% APRC | \u00a31999 | 70% | |\n| | Minimum Loan \u00a32m Maximum Loan \u00a35m | 4.47% | | 5.8% APRC | \u00a31999 | 75% | |\n\n## 5 Year Fixed Products\n\n**Great Escape:** No Product Fee, Free Legals, Free non-disclosed Valuation & \u00a3150 
cashback. (Great Escape: Customers use Barclays nominated Solicitor. Barclays will pay for specified remortgage fees) **Own Solicitor:** Customers provide and use their own Solicitor and receive a Free non-disclosed Valuation & \u00a3500 cashback.\n\n5 Year Fixed Rate (until 30/06/2030)\tMinimum Loan \u00a350k Maximum Loan \u00a32m\t4.15%\tBEBR which is variable, currently 4.50% + 1.99% = 6.49% for term\t5.6% APRC\t\u00a30\t60%\t4% of the balance repaid until 30/06/2030\n\t\t4.27%\t\t5.7% APRC\t\u00a30\t75%\t\n\t\t5.07%\t\t6.1% APRC\t\u00a30\t85%\n\n## 10 Year Fixed Products\n\n10 Year Fixed Rate (until 30/06/2035)\tMinimum Loan \u00a35k Maximum Loan \u00a31m\t4.95%\tBEBR which is variable, currently 4.50% 1.99% = 6.49% for term\t5.6% APRC\t\u00a3999\t60%\t6% of the balance repaid until 30/06/2035\n\t\t5.51%\t\t6.0% APRC\t\u00a3999\t80%\n\n**YOUR HOME MAY BE REPOSSESSED IF YOU DO NOT KEEP UP REPAYMENTS ON YOUR MORTGAGE Barclays Bank UK PLC. Authorised by the Prudential Regulation Authority and regulated by the Financial Conduct Authority and the Prudential Regulation Authority (Financial Services Register No. 759676). Registered in England. Registered No. 9740322 Registered Office: 1** **Churchill Place, London E14 5HP.**\n\n**Page 5** **of 7** ![Title: BARCLAYS\nThe image shows the Barclays logo, which is a blue eagle symbol with the text "BARCLAYS" below it.](s3://barclays-output/bda/output/BarclaysDocs/CoreRangeCustomerRateSheet.pdf/d4bd1574-c073-410f-82f9-8ce02e41622e/0/standard_output/0/assets/84f389e9-f0cf-43ed-a353-da3c0c9b39fb.png)\n\nBARCLAYS\n\n# Rates Effective From 30 April 2025\n\n# Green Home Buy to Let (BTL) Purchase Rates\n\n## Barclays Green Home BTL Mortgages\n\n**These products can only be used to purchase a** **new build residential property with an Energy Efficiency Rating of 81 or higher, or an Energy Efficiency Band of A** **or B. 
-** **Not available for Portfolio Landlords\\***\n\n| Product Type | Minimum/Maximum Loan size | Initial Interest Rate | Follow on rate | The Overall Cost for Comparison | Product Fee | LTV | Early Repayment Charge |\n|--------------------------------------|------------------------------------|-------------------------|----------------------------------------------------------------|-----------------------------------|---------------|-------|-------------------------------------------|\n| 2 Year Fixed Rate (until 30/06/2027) | Minimum Loan \u00a335k Maximum Loan \u00a31m | 5.28% | BEBR which is variable, currently 4.50% 4.49% = 8.99% for term | 8.6% APRC | \u00a31295 | 75% | 2% of the balance repaid until 30/06/2027 |\n| 5 Year Fixed Rate (until 30/06/2030) | | 4.57% | | 7.3% APRC | \u00a31295 | 75% | 4% of the balance repaid until 30/06/2030 |\n\n**Satisfactory evidence that the property has an Energy Efficiency Rating of 81 or higher, or has an Energy Efficiency Band of A** **or B, must be provided before the advance is made and shall be either: (a) a** **valid Energy Performance Certificate completed less than 10 years prior to the submission of your mortgage application; or (b) a** **valid Predicted Energy Performance Certificate if the property build phase has not been completed**\n\n## Buy to Let (BTL) Purchase & Remortgage Rates\n\nSwitch & Save available across the range\n\n**Purchase &** **Remortgage** - **Not available for Portfolio Landlords\\***\n\n| Product Type | Minimum/Maximum Loan size | Initial Interest Rate | Follow on rate | The Overall Cost for Comparison | Product Fee | LTV | Early Repayment Charge |\n|--------------------------------------|-----------------------------------|-------------------------|------------------------------------------------------------------|-----------------------------------|---------------|-------|-------------------------------------------|\n| 2 Year Fixed Rate (until 30/06/2027) | Minimum Loan \u00a31m Maximum 
Loan \u00a32m | 5.15% | BEBR which is variable, currently 4.50% + 4.49% = 8.99% for term | 8.5% APRC | \u00a32495 | 60% | 2% of the balance repaid until 30/06/2027 |\n| 5 Year Fixed Rate (until 30/06/2030) | Minimum Loan \u00a31m Maximum Loan \u00a32m | 4.80% | | 7.4% APRC | \u00a32495 | 60% | 4% of the balance repaid until 30/06/2030 |\n\n**Purchase Only- Not available for Portfolio Landlords\\***\n\n| Product Type | Minimum/Maximum Loan size | Initial Interest Rate | Follow on rate | The Overall Cost for Comparison | Product Fee | LTV | Early Repayment Charge |\n|--------------------------------------|------------------------------------|-------------------------|------------------------------------------------------------------|-----------------------------------|---------------|-------|-------------------------------------------|\n| 2 Year Fixed Rate (until 30/06/2027) | Minimum Loan \u00a335k Maximum Loan \u00a31m | 5.38% | BEBR which is variable, currently 4.50% + 4.49% = 8.99% for term | 8.6% APRC | \u00a31295 | 75% | 2% of the balance repaid until 30/06/2027 |\n| 5 Year Fixed Rate (until 30/06/2030) | | 4.67% | | 7.4% APRC | \u00a31295 | 75% | 4% of the balance repaid until 30/06/2030 |\n\n**Remortgage Only- Not available for Portfolio Landlords\\***\n\n| Product Type | Minimum/Maximum Loan size | Initial Interest Rate | Follow on rate | The Overall Cost for Comparison | Product Fee | LTV | Early Repayment Charge |\n|--------------------------------------|------------------------------------|-------------------------|------------------------------------------------------------------|-----------------------------------|---------------|-------|-------------------------------------------|\n| 2 Year Fixed Rate (until 30/06/2027) | Minimum Loan \u00a335k Maximum Loan \u00a31m | 5.15% | BEBR which is variable, currently 4.50% + 4.49% = 8.99% for term | 8.5% APRC | \u00a30 | 60% | 2% of the balance repaid until 30/06/2027 |\n| | | 5.01% | | 8.6% APRC | \u00a31795 
| 75% | |\n| 5 Year Fixed Rate (until 30/06/2030) | | 4.57% | | 7.3% APRC | \u00a30 | 60% | 4% of the balance repaid until 30/06/2030 |\n| | | 4.41% | | 7.3% APRC | \u00a31795 | 75% | |\n| | | 4.79% | | 7.4% APRC | \u00a30 | 75% | |\n\n**Portfolio Landlords\\* Purchase &** **Remortgage Available for Portfolio Landlords\\* Only**\n\n| Product Type | Minimum/Maximum Loan size | Initial Interest Rate | Follow on rate | The Overall Cost for Comparison | Product Fee | LTV | Early Repayment Charge |\n|--------------------------------------|------------------------------------|-------------------------|------------------------------------------------------------------|-----------------------------------|---------------|-------|-------------------------------------------|\n| 2 Year Fixed Rate (until 30/06/2027) | Minimum Loan \u00a335k Maximum Loan \u00a31m | 5.01% | BEBR which is variable, currently 4.50% + 4.49% = 8.99% for term | 8.6% APRC | \u00a32495 | 75% | 2% of the balance repaid until 30/06/2027 |\n\n\\*Portfolio Landlords are those with four mortgaged rental properties or more across all lenders. This includes the subject property\n\n**YOUR HOME MAY BE REPOSSESSED IF YOU DO NOT KEEP UP REPAYMENTS ON YOUR MORTGAGE Barclays Bank UK PLC. Authorised by the Prudential Regulation Authority and regulated by the Financial Conduct Authority and the Prudential Regulation Authority (Financial Services Register No. 759676). Registered in England. Registered No. 
9740322 Registered Office: 1** **Churchill Place, London E14 5HP.**\n\n**Page 6** **of 7** ![Title: BARCLAYS\nThe image shows the Barclays logo, which is a blue eagle symbol with the text "BARCLAYS" below it.](s3://barclays-output/bda/output/BarclaysDocs/CoreRangeCustomerRateSheet.pdf/d4bd1574-c073-410f-82f9-8ce02e41622e/0/standard_output/0/assets/d349177f-da79-48d1-a566-a4f5fb2c7af2.png)\n\nBARCLAYS\n\n# Rates Effective From 30 April 2025\n\n# Features of our Buy to Let Mortgage Products:\n\n## Buy to Let Affordability:\n\nBarclays uses 1 of 2 methods to calculate affordability. Dependent on eligibility this will either be an Interest Coverage Ratio (ICR) or a detailed affordability assessment considering both personal and rental income. The income affordability assessment includes all relevant landlord costs, applicant level tax liability and is assessed against the Bank's affordability rate.\n\n## LTV: Loan to Value, this is based on the lower of the purchase price or our valuation. Large Loans: Buy-to-Let loans over \u00a32million are not available at present. This\n\nincludes rate switches on existing loans over \u00a32m and existing loans where additional borrowing is required which will take the total loan over \u00a32 million.\n\n**Switch &** **Save\u1d40 remortgage package:** Our standard legal service will include solicitor's fees directly relating to the remortgage (registered land only) and Land Registry fees. It does not include any other legal fees, money transfer fees or additional fees incurred in connection with dealing with leasehold, shared ownership properties or registration fees relating to Sasine properties in Scotland. It also excludes other mortgage charges or changes to mortgage parties. So, in the case of a remortgage where it includes, for example, a change of names on the mortgage or the first registration of unregistered land, you will be liable for any associated fees. 
You will be advised by your solicitor of additional costs applicable to your circumstances and charged separately for these.\n\n## Switch & Fix.\n\nThe Tracker products which have an Early Repayment Charge (ERC) carry a Switch & Fix facility. A product carrying a Switch & Fix may be switched to any fixed rate product, subject to availability at the time, without incurring the early repayment charge payable on this product. The switch will be subject to any product fee applicable to the new product at that time. The new product may also have an ERC which will not have the benefit of the Switch & Fix facility.\n\n**BTL Valuation Fees:** A non-disclosed valuation report will normally be prepared solely for mortgage lending purposes. If you would like to receive a disclosed valuation report, please discuss this with your Mortgage Advisor and refer to the BTL Tariff of Charges for the cost of the increased Valuation Fee.\n\n**Condition &** **Charges:** A first charge over the property will be required as security for our mortgage. Full Personal Guarantees are required from all directors/ shareholders of a SPV Limited Company. Full Personal Guarantees are required from all members of a LLP. For interest only loans you are advised to arrange a suitable repayment vehicle to repay the capital at the end of the mortgage term. All mortgages are subject to status and valuation and a minimum age of 18 for the principal borrower. Early repayment charges apply if the mortgage is repaid in whole or in part or transferred to another scheme during the initial period, unless otherwise stated.\n\n**BTL Customer Contact Numbers:** For new enquiries, please speak to the Mortgage Advisor in your local Barclays branch, or alternatively if you would like to deal with us directly call our BTL Sales and Information Team on 0333 202 7580\\* (telephone calls maybe recorded). For an update on an application which is being processed; please call the BTL Processing Unit on 0800 022 4 022. 
For servicing queries on completed cases; please call the BTL Customer Services Team on 4\n\n\\*Calls to 03 numbers from a landline or mobile cost no more than calls to geographic numbers (01 or 02) and are included in any inclusive minutes and discount schemes you may have.\n\n## Features of our Residential Mortgage Products:\n\n- You are strongly recommended to have life assurance in place to cover the full amount of all sums that you may borrow from us under the facility. Applications are subject to status and a minimum age of 18. Mortgages are subject to valuation. A first charge over your property is required. The mortgage is available on an interest only and / or repayment basis. For an interest only mortgage, it is your responsibility to ensure that you have sufficient funds to repay the mortgage at the end of its term. The minimum age for taking out a mortgage with Barclays is 18. Usually the maximum age at the end of the mortgage term should be 70 or your retirement age - whichever is sooner. Where the end of term date of the mortgage would be later than this for any applicant, applications may still be considered on an individual basis. Mortgages are not available for business purposes.\n\n**Charges and interest rates may vary and are correct at the time of going to print. All products are subject to availability and may be changed or withdrawn at any time without notice.**\n\n**Great Escape\u2122 and Switch &** **Save\u2122 remortgage package.**\n\nOur standard legal service will include solicitor's fees directly relating to the remortgage (registered land only) and Land Registry fees. It does not include any other legal fees, money transfer fees or additional fees incurred in connection with dealing with leasehold, shared ownership properties or registration fees relating to Sasine properties in Scotland. It also excludes other mortgage charges or changes to mortgage parties. 
So, in the case of a remortgage where it includes, for example, a change of names on the mortgage or the first registration of unregistered land, you will be liable for any associated fees. You will be advised by your solicitor of additional costs applicable to your circumstances and charged separately for these.\n\n## Switch & Fix.\n\nThe Offset & Tracker products which have an Early Repayment Charge (ERC) carry a Switch & Fix facility. A product carrying a Switch & Fix may be switched to any fixed rate product, subject to availability at the time, without incurring the early repayment charge payable on this product. The switch will be subject to any product fee applicable to the new product at that time. The new product may also have an ERC which will not have the benefit of the Switch & Fix facility.\n\n## Fixed Rates explained\n\n- A fixed rate provides an interest rate that remains the same during the fixed period of the loan. After the fixed period of your loan our fixed rates revert to Bank of England Base Rate plus a margin;\n- This gives you the peace of mind of knowing that your monthly repayment will stay the same for the period during which the rate is fixed, as long as payments are made for the correct amount and on time, allowing you to plan accordingly;\n- An Early Repayment Charge may apply in certain circumstances.\n\n## Tracker Rates explained\n\n- All Base Rate Trackers are linked to Bank of England Base Rate;\n- As the rate is linked to a variable rate, your monthly repayments may increase as well as decrease;\n- This is the only product type available for \"Offset\" mortgages.\n\n**YOUR HOME MAY BE REPOSSESSED IF YOU DO NOT KEEP UP REPAYMENTS ON YOUR MORTGAGE Barclays Bank UK PLC. Authorised by the Prudential Regulation Authority and regulated by the Financial Conduct Authority and the Prudential Regulation Authority (Financial Services Register No. 759676). Registered in England. Registered No. 
9740322 Registered Office: 1** **Churchill Place, London E14 5HP.**\n\n**Page 7** **of 7** """ +sample_text = sample_text.replace('\\n', '\n') + +print(sample_text) + +# Add the parent directory to the path to import the modules +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', '..')) + +def test_character_chunker(): + """Test character-based chunking""" + try: + # Import directly from the file + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'app', 'common', 'chunkers')) + + print("\n" + "="*60) + print("TESTING CHARACTER CHUNKER") + print("="*60) + + + # Create character chunker + chunker = CharacterChunker( + chunk_size=200, + overlap_size=20 + ) + + chunks = chunker.chunk(sample_text) + + print(f"Character Chunker - Chunk Size: 200, Overlap: 20") + print(f"Total chunks: {len(chunks)}") + print(f"Total characters: {sum(len(chunk) for chunk in chunks)}") + print(f"Original text length: {len(sample_text)}") + + for i, chunk in enumerate(chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk[:150] + "..." 
if len(chunk) > 150 else chunk) + + return True + + except Exception as e: + print(f"Error testing character chunker: {e}") + return False + +def test_regex_chunker(): + """Test regex-based chunking""" + try: + # Import directly from the file + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'app', 'common', 'chunkers')) + + print("\n" + "="*60) + print("TESTING REGEX CHUNKER") + print("="*60) + + + # Create regex chunker + chunker = RegexChunker(pattern="\\r?\\n") + + chunks = chunker.chunk(sample_text) + + print(f"Regex Chunker - Pattern: \\r?\\n (split on newlines)") + print(f"Total chunks: {len(chunks)}") + + for i, chunk in enumerate(chunks): + if chunk.strip(): # Only show non-empty chunks + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk.strip()) + if len(chunk) > 100: + print("...") + + return True + + except Exception as e: + print(f"Error testing regex chunker: {e}") + return False + +def test_markdown_chunker(): + """Test markdown-based chunking""" + try: + # Import directly from the file + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'app', 'common', 'chunkers')) + + print("\n" + "="*60) + print("TESTING MARKDOWN CHUNKER") + print("="*60) + + + # Create markdown chunker + chunker = MarkdownChunker( + chunk_size=300, + chunk_overlap=30 + ) + + chunks = chunker.chunk(sample_text) + + print(f"Markdown Chunker - Chunk Size: 300, Overlap: 30") + print(f"Total chunks: {len(chunks)}") + print(f"Total characters: {sum(len(chunk) for chunk in chunks)}") + print(f"Original text length: {len(sample_text)}") + + for i, chunk in enumerate(chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk[:150] + "..." 
if len(chunk) > 150 else chunk) + + return True + + except Exception as e: + print(f"Error testing markdown chunker: {e}") + return False + +def test_recursive_chunker(): + """Test recursive-based chunking""" + try: + # Import directly from the file + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'app', 'common', 'chunkers')) + + print("\n" + "="*60) + print("TESTING RECURSIVE CHUNKER") + print("="*60) + + + # Create recursive chunker + chunker = RecursiveChunker( + chunk_size=250, + overlap_size=25 + ) + + chunks = chunker.chunk(sample_text) + + print(f"Recursive Chunker - Chunk Size: 250, Overlap: 25") + print(f"Total chunks: {len(chunks)}") + print(f"Total characters: {sum(len(chunk) for chunk in chunks)}") + print(f"Original text length: {len(sample_text)}") + + for i, chunk in enumerate(chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk[:150] + "..." if len(chunk) > 150 else chunk) + + return True + + except Exception as e: + print(f"Error testing recursive chunker: {e}") + return False + +def test_edge_cases(): + """Test chunkers with edge cases""" + try: + # Import directly from the file + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'app', 'common', 'chunkers')) + + print("\n" + "="*60) + print("TESTING EDGE CASES") + print("="*60) + + chunker = CharacterChunker(chunk_size=100) + + # Test with empty string + empty_text = "" + print("\n--- Testing with empty string ---") + + chunks = chunker.chunk(empty_text) + print(f"Empty string chunks: {chunks}") + + # Test with very short text + short_text = "Hello" + print("\n--- Testing with short text ---") + + chunks = chunker.chunk(short_text) + print(f"Short text chunks: {chunks}") + + # Test with text exactly chunk size + exact_text = "A" * 100 + print("\n--- Testing with text exactly chunk size ---") + + chunks = chunker.chunk(exact_text) + print(f"Exact chunk size chunks: {len(chunks)}") + + return True + + except Exception as e: + 
print(f"Error testing edge cases: {e}") + return False + +def test_chunker_comparison(): + """Compare different chunkers with the same text""" + try: + # Import directly from the file + sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'app', 'common', 'chunkers')) + + print("\n" + "="*60) + print("CHUNKER COMPARISON") + print("="*60) + + # Sample text + sample_text = """This is a sample text for testing chunkers. + +It has multiple paragraphs and lines. + +We can see how different chunkers handle this text.""" + + print(f"Sample text: {repr(sample_text)}") + print(f"Text length: {len(sample_text)} characters") + + # Test character chunker + char_chunker = CharacterChunker(chunk_size=50, overlap_size=5) + char_chunks = char_chunker.chunk(sample_text) + + print(f"\nCharacter Chunker (size=50, overlap=5):") + print(f" Total chunks: {len(char_chunks)}") + for i, chunk in enumerate(char_chunks): + print(f" Chunk {i+1}: {repr(chunk)}") + + # Test regex chunker + regex_chunker = RegexChunker(pattern="\\r?\\n") + regex_chunks = regex_chunker.chunk(sample_text) + + print(f"\nRegex Chunker (pattern=\\r?\\n):") + print(f" Total chunks: {len(regex_chunks)}") + for i, chunk in enumerate(regex_chunks): + print(f" Chunk {i+1}: {repr(chunk)}") + + return True + + except Exception as e: + print(f"Error testing chunker comparison: {e}") + return False + +def main(): + """Main function to run all tests""" + print("=" * 80) + print("DIRECT CHUNKER TESTING") + print("=" * 80) + + results = [] + + # Test each chunker + results.append(("Character Chunker", test_character_chunker())) + results.append(("Regex Chunker", test_regex_chunker())) + results.append(("Markdown Chunker", test_markdown_chunker())) + results.append(("Recursive Chunker", test_recursive_chunker())) + results.append(("Edge Cases", test_edge_cases())) + results.append(("Chunker Comparison", test_chunker_comparison())) + + # Print summary + print("\n" + "=" * 80) + print("TEST SUMMARY") + print("=" * 
80) + + for test_name, success in results: + status = "✓ PASS" if success else "✗ FAIL" + print(f"{test_name}: {status}") + + passed = sum(1 for _, success in results if success) + total = len(results) + + print(f"\nOverall: {passed}/{total} tests passed") + + if passed == total: + print("🎉 All tests passed!") + else: + print("⚠️ Some tests failed. Check the output above for details.") + +if __name__ == "__main__": + main() + diff --git a/ecc/tests/test_chunkers_simple.py b/ecc/tests/test_chunkers_simple.py new file mode 100644 index 0000000..fb01732 --- /dev/null +++ b/ecc/tests/test_chunkers_simple.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python3 +""" +Simple test script for testing different chunkers with sample text. +This version focuses on basic chunkers that don't require external dependencies. +""" + +import sys +import os + +# Add the parent directory to the path to import the modules +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', '..')) + +def test_character_chunker(): + """Test character-based chunking""" + try: + from common.chunkers.character_chunker import CharacterChunker + + print("\n" + "="*60) + print("TESTING CHARACTER CHUNKER") + print("="*60) + + # Sample text for testing + sample_text = """# Introduction to GraphRAG + +GraphRAG is a powerful framework for building Retrieval-Augmented Generation (RAG) systems using graph databases. + +## What is RAG? + +Retrieval-Augmented Generation (RAG) is a technique that combines the power of large language models with external knowledge retrieval. It allows AI systems to access and use information that wasn't part of their training data. + +## Key Components + +1. **Document Ingestion**: Documents are processed and chunked into smaller pieces +2. **Embedding Generation**: Each chunk is converted into a vector representation +3. **Vector Storage**: Embeddings are stored in a vector database for efficient retrieval +4. 
**Query Processing**: User queries are processed and relevant chunks are retrieved +5. **Response Generation**: The LLM generates responses based on retrieved context + +## Benefits + +- Improved accuracy through access to current information +- Reduced hallucination by grounding responses in retrieved facts +- Scalable knowledge management +- Cost-effective compared to fine-tuning + +This framework provides a robust foundation for building enterprise-grade RAG applications.""" + + # Create character chunker + chunker = CharacterChunker( + chunk_size=200, + overlap_size=20 + ) + + chunks = chunker.chunk(sample_text) + + print(f"Character Chunker - Chunk Size: 200, Overlap: 20") + print(f"Total chunks: {len(chunks)}") + print(f"Total characters: {sum(len(chunk) for chunk in chunks)}") + print(f"Original text length: {len(sample_text)}") + + for i, chunk in enumerate(chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk[:150] + "..." if len(chunk) > 150 else chunk) + + return True + + except Exception as e: + print(f"Error testing character chunker: {e}") + return False + +def test_regex_chunker(): + """Test regex-based chunking""" + try: + from common.chunkers.regex_chunker import RegexChunker + + print("\n" + "="*60) + print("TESTING REGEX CHUNKER") + print("="*60) + + # Sample text for testing + sample_text = """# Introduction to GraphRAG + +GraphRAG is a powerful framework for building Retrieval-Augmented Generation (RAG) systems using graph databases. + +## What is RAG? + +Retrieval-Augmented Generation (RAG) is a technique that combines the power of large language models with external knowledge retrieval. It allows AI systems to access and use information that wasn't part of their training data. + +## Key Components + +1. **Document Ingestion**: Documents are processed and chunked into smaller pieces +2. **Embedding Generation**: Each chunk is converted into a vector representation +3. 
**Vector Storage**: Embeddings are stored in a vector database for efficient retrieval +4. **Query Processing**: User queries are processed and relevant chunks are retrieved +5. **Response Generation**: The LLM generates responses based on retrieved context + +## Benefits + +- Improved accuracy through access to current information +- Reduced hallucination by grounding responses in retrieved facts +- Scalable knowledge management +- Cost-effective compared to fine-tuning + +This framework provides a robust foundation for building enterprise-grade RAG applications.""" + + # Create regex chunker + chunker = RegexChunker(pattern="\\r?\\n") + + chunks = chunker.chunk(sample_text) + + print(f"Regex Chunker - Pattern: \\r?\\n (split on newlines)") + print(f"Total chunks: {len(chunks)}") + + for i, chunk in enumerate(chunks): + if chunk.strip(): # Only show non-empty chunks + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk.strip()) + if len(chunk) > 100: + print("...") + + return True + + except Exception as e: + print(f"Error testing regex chunker: {e}") + return False + +def test_markdown_chunker(): + """Test markdown-based chunking""" + try: + from common.chunkers.markdown_chunker import MarkdownChunker + + print("\n" + "="*60) + print("TESTING MARKDOWN CHUNKER") + print("="*60) + + # Sample text for testing + sample_text = """# Introduction to GraphRAG + +GraphRAG is a powerful framework for building Retrieval-Augmented Generation (RAG) systems using graph databases. + +## What is RAG? + +Retrieval-Augmented Generation (RAG) is a technique that combines the power of large language models with external knowledge retrieval. It allows AI systems to access and use information that wasn't part of their training data. + +## Key Components + +1. **Document Ingestion**: Documents are processed and chunked into smaller pieces +2. **Embedding Generation**: Each chunk is converted into a vector representation +3. 
**Vector Storage**: Embeddings are stored in a vector database for efficient retrieval +4. **Query Processing**: User queries are processed and relevant chunks are retrieved +5. **Response Generation**: The LLM generates responses based on retrieved context + +## Benefits + +- Improved accuracy through access to current information +- Reduced hallucination by grounding responses in retrieved facts +- Scalable knowledge management +- Cost-effective compared to fine-tuning + +This framework provides a robust foundation for building enterprise-grade RAG applications.""" + + # Create markdown chunker + chunker = MarkdownChunker( + chunk_size=300, + chunk_overlap=30 + ) + + chunks = chunker.chunk(sample_text) + + print(f"Markdown Chunker - Chunk Size: 300, Overlap: 30") + print(f"Total chunks: {len(chunks)}") + print(f"Total characters: {sum(len(chunk) for chunk in chunks)}") + print(f"Original text length: {len(sample_text)}") + + for i, chunk in enumerate(chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk[:150] + "..." if len(chunk) > 150 else chunk) + + return True + + except Exception as e: + print(f"Error testing markdown chunker: {e}") + return False + +def test_recursive_chunker(): + """Test recursive-based chunking""" + try: + from common.chunkers.recursive_chunker import RecursiveChunker + + print("\n" + "="*60) + print("TESTING RECURSIVE CHUNKER") + print("="*60) + + # Sample text for testing + sample_text = """# Introduction to GraphRAG + +GraphRAG is a powerful framework for building Retrieval-Augmented Generation (RAG) systems using graph databases. + +## What is RAG? + +Retrieval-Augmented Generation (RAG) is a technique that combines the power of large language models with external knowledge retrieval. It allows AI systems to access and use information that wasn't part of their training data. + +## Key Components + +1. **Document Ingestion**: Documents are processed and chunked into smaller pieces +2. 
**Embedding Generation**: Each chunk is converted into a vector representation +3. **Vector Storage**: Embeddings are stored in a vector database for efficient retrieval +4. **Query Processing**: User queries are processed and relevant chunks are retrieved +5. **Response Generation**: The LLM generates responses based on retrieved context + +## Benefits + +- Improved accuracy through access to current information +- Reduced hallucination by grounding responses in retrieved facts +- Scalable knowledge management +- Cost-effective compared to fine-tuning + +This framework provides a robust foundation for building enterprise-grade RAG applications.""" + + # Create recursive chunker + chunker = RecursiveChunker( + chunk_size=250, + overlap_size=25 + ) + + chunks = chunker.chunk(sample_text) + + print(f"Recursive Chunker - Chunk Size: 250, Overlap: 25") + print(f"Total chunks: {len(chunks)}") + print(f"Total characters: {sum(len(chunk) for chunk in chunks)}") + print(f"Original text length: {len(sample_text)}") + + for i, chunk in enumerate(chunks): + print(f"\n--- Chunk {i+1} (Length: {len(chunk)}) ---") + print(chunk[:150] + "..." 
if len(chunk) > 150 else chunk) + + return True + + except Exception as e: + print(f"Error testing recursive chunker: {e}") + return False + +def test_edge_cases(): + """Test chunkers with edge cases""" + try: + from common.chunkers.character_chunker import CharacterChunker + + print("\n" + "="*60) + print("TESTING EDGE CASES") + print("="*60) + + chunker = CharacterChunker(chunk_size=100) + + # Test with empty string + empty_text = "" + print("\n--- Testing with empty string ---") + + chunks = chunker.chunk(empty_text) + print(f"Empty string chunks: {chunks}") + + # Test with very short text + short_text = "Hello" + print("\n--- Testing with short text ---") + + chunks = chunker.chunk(short_text) + print(f"Short text chunks: {chunks}") + + # Test with text exactly chunk size + exact_text = "A" * 100 + print("\n--- Testing with text exactly chunk size ---") + + chunks = chunker.chunk(exact_text) + print(f"Exact chunk size chunks: {len(chunks)}") + + return True + + except Exception as e: + print(f"Error testing edge cases: {e}") + return False + +def main(): + """Main function to run all tests""" + print("=" * 80) + print("SIMPLE CHUNKER TESTING") + print("=" * 80) + + results = [] + + # Test each chunker + results.append(("Character Chunker", test_character_chunker())) + results.append(("Regex Chunker", test_regex_chunker())) + results.append(("Markdown Chunker", test_markdown_chunker())) + results.append(("Recursive Chunker", test_recursive_chunker())) + results.append(("Edge Cases", test_edge_cases())) + + # Print summary + print("\n" + "=" * 80) + print("TEST SUMMARY") + print("=" * 80) + + for test_name, success in results: + status = "✓ PASS" if success else "✗ FAIL" + print(f"{test_name}: {status}") + + passed = sum(1 for _, success in results if success) + total = len(results) + + print(f"\nOverall: {passed}/{total} tests passed") + + if passed == total: + print("🎉 All tests passed!") + else: + print("⚠️ Some tests failed. 
Check the output above for details.") + +if __name__ == "__main__": + main() + diff --git a/licenses/README.md b/licenses/README.md new file mode 100644 index 0000000..631147b --- /dev/null +++ b/licenses/README.md @@ -0,0 +1,94 @@ +This folder contains license files for the libraries used in the GraphRAG project. + + +The GraphRAG project uses various open-source libraries, each with their own license. This folder contains the actual license files for these libraries to ensure compliance and transparency. + + +We have successfully collected **34 license files** for the key libraries used in the GraphRAG project: + +- `fastapi-MIT` - MIT License +- `starlette-BSD-3-Clause` - BSD 3-Clause License +- `websockets-BSD` - BSD License +- `requests-Apache-2.0` - Apache 2.0 License + +- `pytigergraph-Apache-2.0` - Apache 2.0 License +- `pytigerdriver-Apache-2.0` - Apache 2.0 License + +- `langchain-MIT` - MIT License +- `langgraph-MIT` - MIT License +- `openai-python-MIT` - MIT License +- `pydantic-MIT` - MIT License + +- `sqlalchemy-MIT` - MIT License + +- `azure-core-MIT` - MIT License +- `google-cloud-aiplatform-Apache-2.0` - Apache 2.0 License +- `boto3-Apache-2.0` - Apache 2.0 License + +- `numpy-BSD` - BSD License +- `pandas-BSD` - BSD License +- `scikit-learn-BSD` - BSD License +- `scipy-BSD` - BSD License +- `pyarrow-Apache-2.0` - Apache 2.0 License + +- `asyncer-MIT` - MIT License +- `tenacity-Apache-2.0` - Apache 2.0 License +- `python-dotenv-BSD` - BSD License +- `pyyaml-MIT` - MIT License +- `watchfiles-MIT` - MIT License + +- `cryptography-Apache-2.0` - Apache 2.0 License +- `pycryptodome-Public-Domain-BSD` - Public Domain/BSD +- `argon2-cffi-MIT` - MIT License + +- `pytest-MIT` - MIT License + +- `lxml-BSD` - BSD License +- `pypdf-MIT` - MIT License + +- `huggingface-hub-Apache-2.0` - Apache 2.0 License + +- `sentry-sdk-BSD` - BSD License + + +- **MIT License** (15 libraries): Very permissive, compatible with all other licenses +- **Apache 2.0 License** (8 
libraries): Permissive with patent protection +- **BSD License** (8 libraries): Very permissive, compatible with all other licenses +- **Public Domain/BSD** (1 library): Very permissive + + +✅ **All licenses are compatible** - The project uses primarily permissive licenses that are all compatible with each other and with commercial use. + +✅ **No GPL dependencies** - There are no GPL-licensed libraries that would require the entire project to be GPL. + +✅ **Commercial-friendly** - All the licenses allow commercial use, modification, and distribution. + + +License files follow the pattern: `library_name-license_name` +- Library names use hyphens for multi-word libraries +- License names can be multipart (e.g., Apache-2.0, BSD-3-Clause) +- No file extensions +- Examples: `langchain-MIT`, `pandas-BSD`, `requests-Apache-2.0` + + +Some libraries could not be fetched due to: +- Repository structure changes +- Different branch names +- Repository access issues + +The missing libraries are typically less critical and their licenses can be found on their respective GitHub repositories or PyPI pages. + + +These license files are provided for: +- **Compliance**: Ensuring proper attribution and license compliance +- **Transparency**: Making it clear what licenses govern the dependencies +- **Documentation**: Providing easy access to license terms for legal review + + +All license files were fetched directly from the official repositories of each library using the URLs specified in the license files themselves. 
+ +--- + +**Total License Files**: 34 +**Last Updated**: July 31, 2024 +**Naming Pattern**: library_name-license_name diff --git a/licenses/docling-MIT b/licenses/docling-MIT new file mode 100644 index 0000000..671b116 --- /dev/null +++ b/licenses/docling-MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 International Business Machines + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file From b96ab1fbc90d395e6f1953d03ad2f4596a0a99e0 Mon Sep 17 00:00:00 2001 From: Chengbiao Jin Date: Wed, 24 Jun 2026 14:00:37 -0700 Subject: [PATCH 03/33] Add agentic chat engine with MCP tool execution - Add an agentic chat mode that plans multi-step retrieval and combines structured and unstructured lookups, selectable per graph. - Default to the agentic engine and fall back to the classic engine automatically when the chat model can't do tool-calling. - Add per-graph external MCP server configuration with an admin-only setup page. 
- Surface the agent's plan and per-step execution in the trace log view. Refs: GML-2107, GML-2109, GML-2111, GML-2112, GML-2102, GML-1983, GML-1987 --- common/config.py | 38 ++ common/llm_services/base_llm.py | 98 ++- common/llm_services/capabilities.py | 149 ++++ common/mcp_config.py | 118 ++++ common/py_schemas/schemas.py | 39 ++ common/py_schemas/tool_io_schemas.py | 44 ++ graphrag-ui/src/main.tsx | 5 + graphrag-ui/src/pages/TraceLogs.tsx | 83 ++- .../src/pages/setup/GraphRAGConfig.tsx | 25 + .../src/pages/setup/McpServersConfig.tsx | 635 ++++++++++++++++++ graphrag-ui/src/pages/setup/SetupLayout.tsx | 9 + graphrag/app/agent/agent.py | 35 +- graphrag/app/agent/agentic_agent.py | 181 +++++ graphrag/app/agent/agentic_executor.py | 170 +++++ graphrag/app/agent/agentic_graph.py | 130 ++++ graphrag/app/agent/agentic_planner.py | 128 ++++ graphrag/app/agent/agentic_react.py | 195 ++++++ graphrag/app/agent/agentic_synthesizer.py | 76 +++ graphrag/app/main.py | 14 + graphrag/app/mcp_addons/__init__.py | 33 + graphrag/app/mcp_addons/client_manager.py | 222 ++++++ graphrag/app/mcp_addons/registry_adapter.py | 102 +++ graphrag/app/mcp_addons/result_normalize.py | 87 +++ graphrag/app/mcp_addons/runtime.py | 101 +++ graphrag/app/routers/__init__.py | 1 + graphrag/app/routers/mcp_servers.py | 316 +++++++++ graphrag/app/tools/find_existing_query.py | 289 ++++++++ graphrag/app/tools/graphrag_tools.py | 320 +++++++++ graphrag/app/tools/tg_mcp_tools.py | 167 +++++ graphrag/app/tools/tool_guards.py | 84 +++ graphrag/app/tools/tool_registry.py | 318 +++++++++ graphrag/tests/eval_agentic_vs_classic.py | 127 ++++ 32 files changed, 4331 insertions(+), 8 deletions(-) create mode 100644 common/llm_services/capabilities.py create mode 100644 common/mcp_config.py create mode 100644 graphrag-ui/src/pages/setup/McpServersConfig.tsx create mode 100644 graphrag/app/agent/agentic_agent.py create mode 100644 graphrag/app/agent/agentic_executor.py create mode 100644 
graphrag/app/agent/agentic_graph.py create mode 100644 graphrag/app/agent/agentic_planner.py create mode 100644 graphrag/app/agent/agentic_react.py create mode 100644 graphrag/app/agent/agentic_synthesizer.py create mode 100644 graphrag/app/mcp_addons/__init__.py create mode 100644 graphrag/app/mcp_addons/client_manager.py create mode 100644 graphrag/app/mcp_addons/registry_adapter.py create mode 100644 graphrag/app/mcp_addons/result_normalize.py create mode 100644 graphrag/app/mcp_addons/runtime.py create mode 100644 graphrag/app/routers/mcp_servers.py create mode 100644 graphrag/app/tools/find_existing_query.py create mode 100644 graphrag/app/tools/graphrag_tools.py create mode 100644 graphrag/app/tools/tg_mcp_tools.py create mode 100644 graphrag/app/tools/tool_guards.py create mode 100644 graphrag/app/tools/tool_registry.py create mode 100644 graphrag/tests/eval_agentic_vs_classic.py diff --git a/common/config.py b/common/config.py index cd51d6a..df6419c 100644 --- a/common/config.py +++ b/common/config.py @@ -325,6 +325,44 @@ def get_graphrag_config(graphname=None): return result +def get_agent_mode(graphname=None) -> str: + """Return the chat answer engine for the graph: ``"agentic"`` (default) + or ``"classic"``. Read from ``graphrag_config.agent_mode`` with per-graph + override. The make_agent capability gate may still downgrade an + ``"agentic"`` request to classic when the chat model can't tool-call. + """ + mode = get_graphrag_config(graphname).get("agent_mode", "agentic") + return "classic" if str(mode).lower() == "classic" else "agentic" + + +def get_tool_selection_mode(graphname=None) -> str: + """Return the planner's external-tool-selection mode for the graph. + + ``"flat"`` (default) — every enabled external MCP tool is included in + every planner prompt alongside the always-on GraphRAG built-ins. 
+ ``"purpose_filter"`` — a cheap pre-step picks relevant servers from + each spec's ``purpose`` text before assembling the planner prompt + (deferred; currently falls back to flat with a one-line warning). + """ + mode = get_graphrag_config(graphname).get("tool_selection", "flat") + mode = str(mode).lower() + return "purpose_filter" if mode == "purpose_filter" else "flat" + + +def get_mcp_servers(graphname=None): + """Return the merged, enabled external MCP server list for the graph. + + Resolution: global ``mcp_servers`` (top-level, sibling of + ``graphrag_config``) merged with per-graph ``mcp_servers``. Per-graph + entries override global ones by ``name``; ``enabled=False`` suppresses + an entry from the result. See ``common.mcp_config`` for the schema. + """ + from common.mcp_config import resolve_mcp_servers + global_list = server_config.get("mcp_servers") or [] + graph_list = _load_graph_config(graphname).get("mcp_servers") or [] + return resolve_mcp_servers(global_list, graph_list) + + PATH_PREFIX = os.getenv("PATH_PREFIX", "") PRODUCTION = os.getenv("PRODUCTION", "false").lower() == "true" diff --git a/common/llm_services/base_llm.py b/common/llm_services/base_llm.py index e5f04dc..d5a235e 100644 --- a/common/llm_services/base_llm.py +++ b/common/llm_services/base_llm.py @@ -144,6 +144,82 @@ def invoke_with_parser( return parser.parse(json_match.group()) raise + def invoke_with_tools( + self, + messages: list, + tools: list, + caller_name: str = "unknown", + tool_choice=None, + ): + """Invoke the chat model with tool schemas bound. + + Used by the agentic engine. Returns the raw ``AIMessage`` — read + ``resp.tool_calls`` (a list of ``{"name", "args", "id"}``) when the + model wants to call tools, or ``resp.content`` for a final message. + Usage is tracked the same way ``invoke_with_parser`` does. + + Args: + messages: LangChain messages (or ``(role, content)`` tuples). 
+ tools: tool definitions accepted by ``bind_tools`` — LangChain + tool objects, pydantic classes, or JSON-schema dicts. + tool_choice: optional; force a tool, ``"any"``, or ``"auto"``. + """ + if tool_choice is not None: + bound = self.llm.bind_tools(tools, tool_choice=tool_choice) + else: + bound = self.llm.bind_tools(tools) + + usage_data = {} + with get_openai_callback() as cb: + resp = bound.invoke(messages) + usage_data["input_tokens"] = cb.prompt_tokens + usage_data["output_tokens"] = cb.completion_tokens + usage_data["total_tokens"] = cb.total_tokens + usage_data["cost"] = cb.total_cost + logger.info(f"{caller_name} usage: {usage_data}") + _record_usage(caller_name, usage_data) + return resp + + def invoke_structured( + self, + messages: list, + schema, + caller_name: str = "unknown", + ): + """Invoke the chat model with native structured output. + + Returns an instance of ``schema`` (a pydantic class). Used by the + planner to get a typed ``Plan`` back. Falls back to a JSON-extraction + parse when the provider's structured-output path returns text. 
+ """ + usage_data = {} + with get_openai_callback() as cb: + try: + structured = self.llm.with_structured_output(schema) + result = structured.invoke(messages) + except Exception as exc: + logger.warning( + f"{caller_name}: structured output failed ({exc}); " + "falling back to parser" + ) + parser = PydanticOutputParser(pydantic_object=schema) + raw = self.llm.invoke(messages) + raw_text = raw.content if hasattr(raw, "content") else str(raw) + try: + result = parser.parse(raw_text) + except OutputParserException: + json_match = re.search(r"\{[\s\S]*\}", raw_text) + if not json_match: + raise + result = parser.parse(json_match.group()) + usage_data["input_tokens"] = cb.prompt_tokens + usage_data["output_tokens"] = cb.completion_tokens + usage_data["total_tokens"] = cb.total_tokens + usage_data["cost"] = cb.total_cost + logger.info(f"{caller_name} usage: {usage_data}") + _record_usage(caller_name, usage_data) + return result + async def ainvoke_with_parser( self, prompt: BasePromptTemplate, @@ -301,7 +377,27 @@ def entity_relationship_extraction_prompt(self): - Follow these rules strictly. Non-compliance, including poor formatting, results in termination. ## No-Relationship Nodes -- Include nodes that have no relationships. Add the node and leave the relationships section empty.""" +- Include nodes that have no relationships. Add the node and leave the relationships section empty. + +## Chunk Summary (Contextual Retrieval) +In addition to ``nodes`` and ``rels``, populate a ``summary`` object with +the chunk's metadata. The summary is concatenated with the chunk text +before embedding to make retrieval match natural-language questions +more reliably on table-heavy and numeric content. + +- ``topic`` — one short noun phrase (≤12 chars) naming what the chunk + is primarily about, in the source language. +- ``section`` — the heading or section title this chunk falls under, + copied verbatim from the source when present; empty string otherwise. 
+- ``entities`` — list of proper nouns / categories / years explicitly + named in the chunk (e.g. company names, prefecture names, regulatory + bodies, fiscal years). When the chunk contains a table, also include + every column header and row label (e.g. ``"2021年 総預貯金残高"``, + ``"2011-21年 預貯金変化率 個人預金"``) — these carry the dimensional + vocabulary a query is most likely to match on. Skip generic terms. + +Same faithfulness rule applies: only include items explicitly present +in the text — never infer or guess.""" @property def generate_cypher_prompt(self): diff --git a/common/llm_services/capabilities.py b/common/llm_services/capabilities.py new file mode 100644 index 0000000..ff265ec --- /dev/null +++ b/common/llm_services/capabilities.py @@ -0,0 +1,149 @@ +# Copyright (c) 2024-2026 TigerGraph, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Per provider/model capability map for the agentic chat engine. + +The agentic path needs reliable **tool-calling**; "deep thinking" mode +additionally benefits from **extended thinking / reasoning**. Detection +is heuristic and conservative — when unsure we return ``False`` so the +agentic engine falls back to the classic LangGraph path rather than +failing at runtime. + +The map keys on the resolved chat-config's ``llm_service`` (provider) +and ``llm_model`` (model id), matching the shapes produced by +``get_chat_config`` / ``get_llm_service``. 
+""" + +import logging + +logger = logging.getLogger(__name__) + +# Region-prefixed Bedrock inference profiles (us./eu./apac./us-gov.) are +# stripped before matching, so "us.anthropic.claude-..." matches the same +# family entry as "anthropic.claude-...". +_BEDROCK_REGION_PREFIXES = ("us.", "eu.", "apac.", "us-gov.") + + +def _strip_region(model: str) -> str: + for p in _BEDROCK_REGION_PREFIXES: + if model.startswith(p): + return model[len(p):] + return model + + +def _bedrock_tool_calling(model: str) -> bool: + # Anthropic Claude 3+/4, Amazon Nova, Cohere Command-R, Mistral + # Large, and Meta Llama 3.1+ support Bedrock tool use. Older Titan / + # Llama 2 / AI21 Jurassic do not. + return ( + "anthropic.claude-3" in model + or "anthropic.claude-sonnet-4" in model + or "anthropic.claude-opus-4" in model + or "anthropic.claude-haiku-4" in model + or "amazon.nova" in model + or "cohere.command-r" in model + or "mistral.mistral-large" in model + or "meta.llama3-1" in model + or "meta.llama3-2" in model + or "meta.llama3-3" in model + ) + + +def _bedrock_thinking(model: str) -> bool: + # Anthropic extended thinking landed with Claude 3.7 / Sonnet 4 / 4.5 + # and the Opus 4 family. + return ( + "anthropic.claude-3-7" in model + or "anthropic.claude-sonnet-4" in model + or "anthropic.claude-opus-4" in model + ) + + +def _openai_tool_calling(model: str) -> bool: + # GPT-4 family, GPT-4o, GPT-4.1, GPT-5, o-series, and recent + # gpt-3.5-turbo all support function/tool calling. + return ( + model.startswith("gpt-4") + or model.startswith("gpt-5") + or model.startswith("o1") + or model.startswith("o3") + or model.startswith("o4") + or "gpt-3.5-turbo" in model + ) + + +def _openai_thinking(model: str) -> bool: + return ( + model.startswith("o1") + or model.startswith("o3") + or model.startswith("o4") + or model.startswith("gpt-5") + ) + + +def _gemini_tool_calling(model: str) -> bool: + # Gemini 1.5+ and 2.x support function calling. 
+ return "gemini-1.5" in model or "gemini-2" in model or "gemini-exp" in model + + +def _gemini_thinking(model: str) -> bool: + return "gemini-2.5" in model or "thinking" in model + + +def model_capabilities(config: dict) -> dict: + """Return ``{"supports_tool_calling": bool, "supports_thinking": bool}`` + for a resolved chat-LLM config. Conservative: unknown → ``False``. + """ + if not isinstance(config, dict): + return {"supports_tool_calling": False, "supports_thinking": False} + + service = (config.get("llm_service") or "").strip().lower() + model = (config.get("llm_model") or "").strip().lower() + model = _strip_region(model) + + tool_calling = False + thinking = False + + if service in ("bedrock", "aws_bedrock", "awsbedrock"): + tool_calling = _bedrock_tool_calling(model) + thinking = _bedrock_thinking(model) + elif service in ("openai", "azure", "azure_openai", "azureopenai"): + tool_calling = _openai_tool_calling(model) + thinking = _openai_thinking(model) + elif service in ("vertexai", "google_vertexai", "genai", "google_genai", "googlegenai"): + tool_calling = _gemini_tool_calling(model) + thinking = _gemini_thinking(model) + elif service == "groq": + # Groq exposes tool use on Llama 3.1+/3.3 and Mixtral. + tool_calling = "llama-3.1" in model or "llama-3.3" in model or "llama3-groq" in model or "mixtral" in model + elif service == "ollama": + # Local models vary; only the families we've verified for tool use. + tool_calling = "llama3.1" in model or "llama3.2" in model or "qwen2.5" in model or "mistral-nemo" in model + # sagemaker / watsonx / huggingface endpoints: leave both False + # (no reliable, uniform tool-calling guarantee) → classic fallback. 
+ + return {"supports_tool_calling": tool_calling, "supports_thinking": thinking} + + +def model_supports_agentic(config: dict) -> bool: + """Gate for the agentic engine: requires reliable tool-calling.""" + caps = model_capabilities(config) + if not caps["supports_tool_calling"]: + logger.info( + "Agentic mode unavailable for llm_service=%r llm_model=%r " + "(no tool-calling support); using classic engine.", + (config or {}).get("llm_service"), + (config or {}).get("llm_model"), + ) + return caps["supports_tool_calling"] diff --git a/common/mcp_config.py b/common/mcp_config.py new file mode 100644 index 0000000..6f1551e --- /dev/null +++ b/common/mcp_config.py @@ -0,0 +1,118 @@ +# Copyright (c) 2024-2026 TigerGraph, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""External MCP-server config. + +Typed schema and merge logic for ``mcp_servers``, the top-level config +section (sibling of ``graphrag_config``) that catalogs outside Model +Context Protocol servers the agentic engine may dispatch tools to. + +Two scopes — global (``configs/server_config.json``) and per-graph +(``configs/graph_configs/<graphname>/server_config.json``). Per-graph entries +override global ones by ``name``; a per-graph entry with ``enabled=False`` +acts as a tombstone that suppresses a same-named global entry. + +The MCP client manager consumes ``resolve_mcp_servers(...)`` and wires +each enabled spec into the agentic tool registry. 
+""" + +from __future__ import annotations + +from typing import Dict, List, Literal, Optional + +from pydantic import BaseModel, Field, field_validator, model_validator + + +class McpServerSpec(BaseModel): + """One external MCP server. + + Tool names this server exposes are surfaced to the planner under the + ``"<server_name>.<tool_name>"`` namespace (e.g. ``"weather.get_forecast"``) so + they never collide with the built-in GraphRAG tools. + """ + + name: str = Field(min_length=1, description="Unique within scope. Becomes the planner-visible tool prefix.") + transport: Literal["stdio", "http"] + enabled: bool = True + description: str = "" + # One-paragraph hint of what data lives here and when to use it. + # Surfaced only when ``graphrag_config.tool_selection`` is set to + # ``"purpose_filter"`` (deferred); ignored in the default ``"flat"`` + # mode. + purpose: str = "" + + # stdio + command: Optional[str] = None + args: List[str] = Field(default_factory=list) + env: Dict[str, str] = Field(default_factory=dict) + + # http + url: Optional[str] = None + headers: Dict[str, str] = Field(default_factory=dict) + + # identity + forward_user: bool = False + user_header: str = "X-User" + + # security + allowed_tools: List[str] = Field(default_factory=lambda: ["*"]) + + @field_validator("name") + @classmethod + def _name_no_dot(cls, v: str) -> str: + # "." is the registry namespace separator between server and tool + # names; allowing it inside a server name would make dispatch + # ambiguous. + if "." 
in v: + raise ValueError("name must not contain '.'") + return v + + @model_validator(mode="after") + def _transport_requirements(self) -> "McpServerSpec": + if self.transport == "stdio" and not self.command: + raise ValueError("stdio transport requires 'command'") + if self.transport == "http" and not self.url: + raise ValueError("http transport requires 'url'") + return self + + +def resolve_mcp_servers( + global_raw: Optional[List[dict]], + graph_raw: Optional[List[dict]], +) -> List[McpServerSpec]: + """Merge global and per-graph specs; return enabled set. + + - Order: global entries first (in their declared order), then per-graph + entries that introduce new names. + - Override: when both scopes declare the same ``name``, the per-graph + entry replaces the global one in-place (its declared order slot). + - Tombstone: ``enabled=False`` removes the entry from the returned + list, whether the disable comes from global or per-graph. + """ + by_name: Dict[str, McpServerSpec] = {} + order: List[str] = [] + + for raw in global_raw or []: + spec = McpServerSpec(**raw) + if spec.name not in by_name: + order.append(spec.name) + by_name[spec.name] = spec + + for raw in graph_raw or []: + spec = McpServerSpec(**raw) + if spec.name not in by_name: + order.append(spec.name) + by_name[spec.name] = spec # per-graph wins + + return [by_name[n] for n in order if by_name[n].enabled] diff --git a/common/py_schemas/schemas.py b/common/py_schemas/schemas.py index cd46fa6..e8889c6 100644 --- a/common/py_schemas/schemas.py +++ b/common/py_schemas/schemas.py @@ -53,6 +53,38 @@ class GraphRAGResponse(BaseModel): query_sources: Dict = None +# --- Agentic engine (v2.0 deep-thinking mode) ------------------------------ + +class PlanStep(BaseModel): + """One step in an agentic plan DAG. + + ``kind`` is advisory; ``tool`` is the registry tool name actually run. 
+ ``arg_bindings`` maps an arg name to ``"<step_id>.<path>"`` and is + resolved from earlier ``StepResult`` contexts just before the call — this + is how a later structural/unstructured step consumes an earlier one. + """ + id: str + kind: str = "unstructured" # schema | structural | unstructured | answer + tool: str + args: Dict = {} + arg_bindings: Dict[str, str] = {} + depends_on: List[str] = [] + rationale: str = "" + + +class Plan(BaseModel): + steps: List[PlanStep] = [] + strategy: str = "" # one-line, user-facing summary + + +class StepResult(BaseModel): + step_id: str + ok: bool + summary: str = "" + context: Optional[object] = None + citations: List[Dict] = [] + + +class BatchDocumentIngest(BaseModel): service: str service_params: dict @@ -97,6 +129,13 @@ class DocumentChunk(BaseModel): chunk_embedding: List[float] = None entities: List[Dict] = None relationships: List[Dict] = None + # Set by the page- and structure-aware chunker (v2.0). None for chunks + # written by the legacy char-count chunkers. + chunk_kind: str = None + page_no: int = None + under_heading: str = None + continues_from_page: int = None + continues_to_page: int = None class Document(BaseModel): diff --git a/common/py_schemas/tool_io_schemas.py b/common/py_schemas/tool_io_schemas.py index 474212f..b680af3 100644 --- a/common/py_schemas/tool_io_schemas.py +++ b/common/py_schemas/tool_io_schemas.py @@ -85,6 +85,40 @@ class Relationship(BaseRelationship): ) +class ChunkSummary(BaseModel): + """Compact metadata summary for a chunk, used to augment its dense + embedding so retrieval matches natural-language queries more + reliably on table-heavy and numeric content. Tag-line format keeps + each field short and clusterable per keyword. + """ + + topic: str = Field( + "", + description=( + "One short noun phrase (<= 12 chars) naming what this chunk is " + "primarily about. In the source language." 
+ ), + ) + section: str = Field( + "", + description=( + "The heading or section title this chunk falls under, copied " + "verbatim from the source when present; empty string otherwise." + ), + ) + entities: List[str] = Field( + default_factory=list, + description=( + "Proper nouns / named entities / categories mentioned in the " + "chunk (e.g. company names, prefecture names, years, " + "regulatory bodies). When the chunk contains a table, include " + "every column header / row label as an entity too — they carry " + "the dimensional vocabulary a retrieval query is most likely to " + "match on. Used for keyword-style retrieval signals." + ), + ) + + class KnowledgeGraph(BaseModel): """Generate a knowledge graph with entities and relationships.""" @@ -92,6 +126,16 @@ class KnowledgeGraph(BaseModel): rels: List[Relationship] = Field( ..., description="List of relationships in the knowledge graph" ) + summary: Optional[ChunkSummary] = Field( + default=None, + description=( + "Compact metadata summary for the chunk. Used by Contextual " + "Retrieval — concatenated with the raw text before embedding so " + "dense vectors carry the chunk's topic / entities / values " + "explicitly. Optional: parsers tolerate missing summaries from " + "legacy outputs." 
+ ), + ) class ReportQuestion(BaseModel): diff --git a/graphrag-ui/src/main.tsx b/graphrag-ui/src/main.tsx index 69a77e5..53239a5 100755 --- a/graphrag-ui/src/main.tsx +++ b/graphrag-ui/src/main.tsx @@ -11,6 +11,7 @@ import IngestGraph from "./pages/setup/IngestGraph.tsx"; import LLMConfig from "./pages/setup/LLMConfig.tsx"; import GraphDBConfig from "./pages/setup/GraphDBConfig.tsx"; import GraphRAGConfig from "./pages/setup/GraphRAGConfig.tsx"; +import McpServersConfig from "./pages/setup/McpServersConfig.tsx"; import CustomizePrompts from "./pages/setup/CustomizePrompts.tsx"; import { ThemeProvider } from "./components/ThemeProvider.tsx"; import { ModeToggle } from "@/components/ModeToggle.tsx"; @@ -94,6 +95,10 @@ const router = createBrowserRouter([ path: "server-config/graphrag", element: , }, + { + path: "server-config/mcp-servers", + element: , + }, { path: "prompts", element: , diff --git a/graphrag-ui/src/pages/TraceLogs.tsx b/graphrag-ui/src/pages/TraceLogs.tsx index 821059b..a038655 100644 --- a/graphrag-ui/src/pages/TraceLogs.tsx +++ b/graphrag-ui/src/pages/TraceLogs.tsx @@ -63,6 +63,19 @@ interface TimelineStep { durationMs: number; } +interface PlanStepInfo { + id: string; + kind: string; + tool: string; + rationale?: string; + depends_on?: string[]; +} + +interface PlanInfo { + strategy: string; + steps: PlanStepInfo[]; +} + interface TraceData { originalQuery: string; conversationContext: string[]; @@ -81,6 +94,7 @@ interface TraceData { timeline: TimelineStep[]; tokenUsage: TokenUsage; finalResponse: string; + plan: PlanInfo | null; } // ─── Helpers ────────────────────────────────────────────────────────────────── @@ -243,6 +257,7 @@ function buildTraceFromMessage(message: any, userQuery?: string): TraceData { timeline, tokenUsage, finalResponse: message?.content || "", + plan: qs.plan && Array.isArray(qs.plan.steps) ? 
(qs.plan as PlanInfo) : null, }; } @@ -353,6 +368,56 @@ const ExpandableRow: FC<{ // ─── Tab Panels ─────────────────────────────────────────────────────────────── +const KIND_COLORS: Record = { + structural: "bg-indigo-100 dark:bg-indigo-900/30 text-indigo-700 dark:text-indigo-300", + unstructured: "bg-emerald-100 dark:bg-emerald-900/30 text-emerald-700 dark:text-emerald-300", + schema: "bg-amber-100 dark:bg-amber-900/30 text-amber-700 dark:text-amber-300", + answer: "bg-purple-100 dark:bg-purple-900/30 text-purple-700 dark:text-purple-300", +}; + +const PlanPanel: FC<{ trace: TraceData }> = ({ trace }) => { + const plan = trace.plan; + if (!plan) { + return ( +

            + No plan available — this answer used the classic engine (the agentic + engine produces a plan). +

            + ); + } + return ( +
            + {plan.strategy && ( +
            + Strategy +

            {plan.strategy}

            +
            + )} +
              + {plan.steps.map((s, i) => ( +
            1. +
              + {s.id} + + {s.kind} + + {s.tool && {s.tool}} + {s.depends_on && s.depends_on.length > 0 && ( + + ← depends on {s.depends_on.join(", ")} + + )} +
              + {s.rationale && ( +

              {s.rationale}

              + )} +
            2. + ))} +
            +
            + ); +}; + const LogsPanel: FC<{ trace: TraceData }> = ({ trace }) => { const [collapsed, setCollapsed] = useState(false); @@ -928,8 +993,19 @@ const TraceLogs: FC = ({ messageIdProp, onClose }) => { {/* Tabs */} - + + {trace.plan && ( + + Plan + + {trace.plan.steps.length} + + + )} = ({ messageIdProp, onClose }) => { + {trace.plan && ( + + + + )} diff --git a/graphrag-ui/src/pages/setup/GraphRAGConfig.tsx b/graphrag-ui/src/pages/setup/GraphRAGConfig.tsx index 2228690..25be4a5 100644 --- a/graphrag-ui/src/pages/setup/GraphRAGConfig.tsx +++ b/graphrag-ui/src/pages/setup/GraphRAGConfig.tsx @@ -28,6 +28,7 @@ const GraphRAGConfig = () => { const [communityLevel, setCommunityLevel] = useState("2"); const [docOnly, setDocOnly] = useState(false); const [enableRouterFallback, setEnableRouterFallback] = useState(true); + const [agentMode, setAgentMode] = useState<"agentic" | "classic">("agentic"); // Collapsible section toggles (Configuration Scope and General Settings // are always shown). Advanced Ingestion stays collapsed by default — @@ -98,6 +99,7 @@ const GraphRAGConfig = () => { setCommunityLevel(String(graphragConfig.community_level ?? 2)); setDocOnly(graphragConfig.doc_only ?? false); setEnableRouterFallback(graphragConfig.enable_router_fallback ?? true); + setAgentMode(graphragConfig.agent_mode === "classic" ? "classic" : "agentic"); setLoadBatchSize(String(graphragConfig.load_batch_size ?? 500)); setUpsertDelay(String(graphragConfig.upsert_delay ?? 0)); setMaxConcurrency(String(graphragConfig.default_concurrency ?? 
10)); @@ -251,6 +253,7 @@ const GraphRAGConfig = () => { community_level: parseInt(communityLevel), doc_only: docOnly, enable_router_fallback: enableRouterFallback, + agent_mode: agentMode, load_batch_size: parseInt(loadBatchSize), upsert_delay: parseInt(upsertDelay), default_concurrency: parseInt(maxConcurrency), @@ -280,6 +283,7 @@ const GraphRAGConfig = () => { community_level: 2, doc_only: false, enable_router_fallback: true, + agent_mode: "agentic", load_batch_size: 500, upsert_delay: 0, default_concurrency: 10, @@ -588,6 +592,27 @@ const GraphRAGConfig = () => { Fall back to vector search when structured-data retrieval fails.

            + +
            + + +

            + Agentic plans multi-step retrieval and combines structured and + document context to answer. Classic uses the original + single-lane router. Falls back to Classic automatically if the + chat model can't run the agentic engine. +

            +
            diff --git a/graphrag-ui/src/pages/setup/McpServersConfig.tsx b/graphrag-ui/src/pages/setup/McpServersConfig.tsx new file mode 100644 index 0000000..b321666 --- /dev/null +++ b/graphrag-ui/src/pages/setup/McpServersConfig.tsx @@ -0,0 +1,635 @@ +import React, { useEffect, useState, useCallback } from "react"; +import { Plus, Save, Loader2, Trash2, Pencil, PlugZap, Server, ChevronDown, ChevronRight } from "lucide-react"; +import { Input } from "@/components/ui/input"; +import { Button } from "@/components/ui/button"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import ConfigScopeToggle from "@/components/ConfigScopeToggle"; + +const MASKED_SECRET = "********"; + +type Transport = "stdio" | "http"; + +interface McpServer { + name: string; + transport: Transport; + enabled: boolean; + description: string; + purpose: string; + command: string; + args: string[]; + env: Record; + url: string; + headers: Record; + forward_user: boolean; + user_header: string; + allowed_tools: string[]; +} + +const emptyServer = (): McpServer => ({ + name: "", + transport: "stdio", + enabled: true, + description: "", + purpose: "", + command: "", + args: [], + env: {}, + url: "", + headers: {}, + forward_user: false, + user_header: "X-User", + allowed_tools: ["*"], +}); + +const fromApi = (raw: any): McpServer => ({ + ...emptyServer(), + ...raw, + args: Array.isArray(raw?.args) ? raw.args : [], + env: (raw?.env && typeof raw.env === "object") ? raw.env : {}, + headers: (raw?.headers && typeof raw.headers === "object") ? raw.headers : {}, + allowed_tools: Array.isArray(raw?.allowed_tools) && raw.allowed_tools.length > 0 + ? 
raw.allowed_tools : ["*"], +}); + +const isSpecComplete = (s: McpServer): boolean => { + if (!s.name.trim()) return false; + if (s.transport === "stdio") return s.command.trim().length > 0; + return s.url.trim().length > 0; +}; + +// ---- KvEditor / ListEditor / EditForm — extracted to module scope so +// they don't get re-created on every parent render (which would unmount + +// remount the inputs and make typing feel slow). + +const labelClass = "block text-sm font-medium mb-2 text-black dark:text-white"; +const helpClass = "text-xs text-gray-600 dark:text-[#D9D9D9] mt-1"; +const inputDark = "dark:border-[#3D3D3D] dark:bg-background"; + +interface KvEditorProps { + label: string; + value: Record; + onChange: (next: Record) => void; + hint?: string; +} + +const KvEditor: React.FC = ({ label, value, onChange, hint }) => { + const entries = Object.entries(value); + return ( +
            + + {hint &&

            {hint}

            } +
            + {entries.length === 0 && ( +

            (none)

            + )} + {entries.map(([k, v]) => ( +
            + { + const next: Record = {}; + for (const [kk, vv] of entries) { + next[kk === k ? e.target.value : kk] = vv; + } + onChange(next); + }} + placeholder="key" + className={`w-1/3 ${inputDark}`} + /> + onChange({ ...value, [k]: e.target.value })} + placeholder={v === MASKED_SECRET ? "(stored — leave to keep)" : "value"} + className={`flex-1 ${inputDark}`} + type={v === MASKED_SECRET ? "password" : "text"} + /> + +
            + ))} + +
            +
            + ); +}; + +interface ListEditorProps { + label: string; + value: string[]; + onChange: (next: string[]) => void; + placeholder?: string; +} + +const ListEditor: React.FC = ({ label, value, onChange, placeholder }) => ( +
            + + + onChange( + e.target.value + .split(",") + .map((s) => s.trim()) + .filter((s) => s.length > 0) + ) + } + placeholder={placeholder} + className={inputDark} + /> +
            +); + +interface EditFormProps { + server: McpServer; + onPatch: (patch: Partial) => void; + onClose: () => void; +} + +const EditForm: React.FC = ({ server: s, onPatch, onClose }) => { + return ( +
            +
            +
            + + onPatch({ name: e.target.value })} + placeholder="e.g. sales_tg" + className={inputDark} + /> +
            +
            + + +
            +
            + +
            + + onPatch({ description: e.target.value })} + placeholder="Short label shown in tool catalogs" + className={inputDark} + /> +
            + +
            + +