diff --git a/tutorials/44_Creating_Custom_SuperComponents.ipynb b/tutorials/44_Creating_Custom_SuperComponents.ipynb index 092a0ed..c3f27b0 100644 --- a/tutorials/44_Creating_Custom_SuperComponents.ipynb +++ b/tutorials/44_Creating_Custom_SuperComponents.ipynb @@ -47,17 +47,17 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { "id": "UQbU8GUfO-qZ" }, - "outputs": [], "source": [ "%%bash\n", "\n", "pip install haystack-ai\n", "pip install \"sentence-transformers>=4.1.0\" datasets \"transformers[torch,sentencepiece]<5\"" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -96,11 +96,9 @@ }, { "cell_type": "code", - "execution_count": 2, "metadata": { "id": "XvLVaFHTO-qb" }, - "outputs": [], "source": [ "from haystack import Document, Pipeline, super_component\n", "from haystack.components.joiners import DocumentJoiner\n", @@ -131,7 +129,9 @@ " self.pipeline.connect(\"text_embedder\", \"embedding_retriever\")\n", " self.pipeline.connect(\"bm25_retriever\", \"document_joiner\")\n", " self.pipeline.connect(\"embedding_retriever\", \"document_joiner\")" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -144,11 +144,13 @@ }, { "cell_type": "code", - "execution_count": null, "metadata": { - "id": "aNzUi4iz0FAU" + "id": "aNzUi4iz0FAU", + "ExecuteTime": { + "end_time": "2026-05-26T08:59:55.715295Z", + "start_time": "2026-05-26T08:59:25.735412Z" + } }, - "outputs": [], "source": [ "# Load a dataset\n", "dataset = load_dataset(\"HaystackBot/medrag-pubmed-chunk-with-embeddings\", split=\"train\")\n", @@ -162,18 +164,40 @@ "result = retriever.run(\n", " text=query, query=query\n", ") # `query` variable will match with `text` and `query` inputs of components in the pipeline." - ] + ], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generating train split: 100%|██████████| 15377/15377 [00:00<00:00, 159479.28 examples/s]\n", + "The `tokenizer_kwargs` argument was renamed and is now deprecated. Please use `processor_kwargs` instead.\n", + "Batches: 100%|██████████| 1/1 [00:01<00:00, 1.04s/it]\n" + ] + } + ], + "execution_count": 3 }, { "cell_type": "code", - "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ZoTmeqV_j-OI", - "outputId": "7fa55a12-563c-4341-bbaa-3c578cc5e976" + "outputId": "7fa55a12-563c-4341-bbaa-3c578cc5e976", + "ExecuteTime": { + "end_time": "2026-05-26T08:59:55.778626Z", + "start_time": "2026-05-26T08:59:55.738878Z" + } }, + "source": [ + "# Print the results\n", + "print(f\"Found {len(result['documents'])} documents\")\n", + "for i, doc in enumerate(result[\"documents\"][:3]): # Show first 3 documents\n", + " print(f\"\\nDocument {i+1} (Score: {doc.score:.4f}):\")\n", + " print(doc.content[:200] + \"...\")" + ], "outputs": [ { "name": "stdout", @@ -192,13 +216,7 @@ ] } ], - "source": [ - "# Print the results\n", - "print(f\"Found {len(result['documents'])} documents\")\n", - "for i, doc in enumerate(result[\"documents\"][:3]): # Show first 3 documents\n", - " print(f\"\\nDocument {i+1} (Score: {doc.score:.4f}):\")\n", - " print(doc.content[:200] + \"...\")" - ] + "execution_count": 4 }, { "cell_type": "markdown", @@ -236,11 +254,13 @@ }, { "cell_type": "code", - "execution_count": 5, "metadata": { - "id": "INdC3WvLO-qb" + "id": "INdC3WvLO-qb", + "ExecuteTime": { + "end_time": "2026-05-26T08:59:55.806030Z", + "start_time": "2026-05-26T08:59:55.782346Z" + } }, - "outputs": [], "source": [ "from haystack import Document, Pipeline, super_component\n", "from haystack.components.joiners import DocumentJoiner\n", @@ -283,11 +303,12 @@ "\n", " # Define input mapping\n", " self.input_mapping = {\"query\": [\"text_embedder.text\", \"bm25_retriever.query\", \"ranker.query\"]}" - ] + ], + "outputs": [], + "execution_count": 5 }, { "cell_type": "code", - "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -373,106 +394,31 @@ ] }, "id": "yxaN3KBo65pv", - "outputId": "21486dba-7914-4349-a579-2993ec212d86" + "outputId": "21486dba-7914-4349-a579-2993ec212d86", + "ExecuteTime": { + "end_time": "2026-05-26T09:01:35.627813Z", + "start_time": "2026-05-26T08:59:55.807152Z" + } }, + "source": [ + "# Create and run the HybridRetrieverWithRanker\n", + "retriever = HybridRetrieverWithRanker(document_store)\n", + "result = retriever.run(query=query) # instead of retriever.run(text=query, query=query) thanks to input_mapping\n", + "\n", + "# Print the results\n", + "print(f\"Found {len(result['documents'])} documents\")\n", + "for i, doc in enumerate(result[\"documents\"][:3]): # Show first 3 documents\n", + " print(f\"\\nDocument {i+1} (Score: {doc.score:.4f}):\")\n", + " print(doc.content[:200] + \"...\")" + ], "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b4246cbc7b0b4f7784293fbd8337befe", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "config.json: 0%| | 0.00/799 [00:00\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;31m# Deserialize the component from the dictionary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mdeserialized\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcomponent_from_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDocumentPreprocessor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mserialized\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"document_preprocessor\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\nDeserialized component:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdeserialized\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/haystack/core/serialization.py\u001b[0m in \u001b[0;36mcomponent_from_dict\u001b[0;34m(cls, data, name, callbacks)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallbacks\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mcallbacks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcomponent_pre_init\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 166\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mdo_from_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 167\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0m_hook_component_init\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcomponent_pre_init_callback\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/haystack/core/serialization.py\u001b[0m in \u001b[0;36mdo_from_dict\u001b[0;34m()\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdo_from_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"from_dict\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 161\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 162\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdefault_from_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/haystack/components/preprocessors/document_preprocessor.py\u001b[0m in \u001b[0;36mfrom_dict\u001b[0;34m(cls, data)\u001b[0m\n\u001b[1;32m 188\u001b[0m \"\"\"\n\u001b[1;32m 189\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"splitting_function\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"init_parameters\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 190\u001b[0;31m data[\"init_parameters\"][\"splitting_function\"] = deserialize_callable(\n\u001b[0m\u001b[1;32m 191\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"init_parameters\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"splitting_function\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 192\u001b[0m )\n", - "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/haystack/utils/callable_serialization.py\u001b[0m in \u001b[0;36mdeserialize_callable\u001b[0;34m(callable_handle)\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0mraises\u001b[0m \u001b[0mDeserializationError\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mIf\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mcallable\u001b[0m \u001b[0mcannot\u001b[0m \u001b[0mbe\u001b[0m \u001b[0mfound\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 52\u001b[0m \"\"\"\n\u001b[0;32m---> 53\u001b[0;31m \u001b[0mparts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcallable_handle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\".\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 54\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'split'" - ] + "outputId": "4fa8cd47-049b-4baa-bcb3-1b1f6f3e6ef8", + "ExecuteTime": { + "end_time": "2026-05-26T09:01:35.682709Z", + "start_time": "2026-05-26T09:01:35.651336Z" } - ], + }, "source": [ "from haystack.core.serialization import component_to_dict, component_from_dict\n", "from haystack.components.preprocessors import DocumentPreprocessor\n", @@ -589,7 +503,27 @@ "doc = Document(content=\"I love pizza!\")\n", "result = deserialized.run(documents=[doc])\n", "print(f\"\\nDeserialized component produced {len(result['documents'])} documents\")" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Serialized component:\n", + "{'type': 'haystack.components.preprocessors.document_preprocessor.DocumentPreprocessor', 'init_parameters': {'remove_empty_lines': True, 'remove_extra_whitespaces': True, 'remove_repeated_substrings': False, 'keep_id': False, 'remove_substrings': None, 'remove_regex': None, 'unicode_normalization': None, 'ascii_only': False, 'split_by': 'word', 'split_length': 250, 'split_overlap': 0, 'split_threshold': 0, 'splitting_function': None, 'respect_sentence_boundary': False, 'language': 'en', 'use_split_rules': True, 'extend_abbreviations': True}}\n", + "\n", + "Deserialized component:\n", + "\n", + "Inputs:\n", + " - documents: list[Document]\n", + "Outputs:\n", + " - documents: list[Document]\n", + "\n", + "Deserialized component produced 1 documents\n" + ] + } + ], + "execution_count": 7 }, { "cell_type": "markdown", @@ -619,11 +553,13 @@ }, { "cell_type": "code", - "execution_count": 8, "metadata": { - "id": "HryYZP9ZO-qc" + "id": "HryYZP9ZO-qc", + "ExecuteTime": { + "end_time": "2026-05-26T09:01:35.692681Z", + "start_time": "2026-05-26T09:01:35.685324Z" + } }, - "outputs": [], "source": [ "from haystack import Document, Pipeline, super_component\n", "from haystack.components.joiners import DocumentJoiner\n", @@ -673,11 +609,12 @@ " \"ranker.documents\": \"ranked_documents\",\n", " \"text_embedder.embedding\": \"query_embedding\",\n", " }" - ] + ], + "outputs": [], + "execution_count": 8 }, { "cell_type": "code", - "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -697,22 +634,45 @@ ] }, "id": "INdC3WvLO-qc", - "outputId": "a9d0a257-6969-49bd-f141-7d3d4fe24813" + "outputId": "a9d0a257-6969-49bd-f141-7d3d4fe24813", + "ExecuteTime": { + "end_time": "2026-05-26T09:01:39.167534Z", + "start_time": "2026-05-26T09:01:35.694539Z" + } }, + "source": [ + "# Create and run the AdvancedHybridRetriever\n", + "retriever = AdvancedHybridRetriever(document_store)\n", + "result = retriever.run(query=query)\n", + "\n", + "# Print the results\n", + "print(f\"BM25 documents: {len(result['bm25_documents'])}\")\n", + "print(f\"Embedding documents: {len(result['embedding_documents'])}\")\n", + "print(f\"Joined documents: {len(result['joined_documents'])}\")\n", + "print(f\"Ranked documents: {len(result['ranked_documents'])}\")\n", + "print(f\"Query embedding shape: {len(result['query_embedding'])}\")\n", + "\n", + "# Compare the top document from each stage\n", + "print(\"\\nTop BM25 document:\")\n", + "print(result[\"bm25_documents\"][0].content[:200] + \"...\")\n", + "print(f\"Score: {result['bm25_documents'][0].score:.4f}\")\n", + "\n", + "print(\"\\nTop embedding document:\")\n", + "print(result[\"embedding_documents\"][0].content[:200] + \"...\")\n", + "print(f\"Score: {result['embedding_documents'][0].score:.4f}\")\n", + "\n", + "print(\"\\nTop ranked document:\")\n", + "print(result[\"ranked_documents\"][0].content[:200] + \"...\")\n", + "print(f\"Score: {result['ranked_documents'][0].score:.4f}\")" + ], "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1c63ba6f43574419b66fe173a763b04a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Batches: 0%| | 0/1 [00:00