diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py
index e0b9e61a070..1c7ae4f3a85 100644
--- a/src/main/python/systemds/scuro/dataloader/audio_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -63,18 +63,17 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         if not self.load_data_from_file:
             import numpy as np
 
-            self.metadata[file] = self.modality_type.create_metadata(
-                1000, np.array([0])
-            )
+            audio = np.array([0])
+            sr = 1000
         else:
             audio, sr = librosa.load(file, dtype=self._data_type)
 
             if self.normalize:
                 audio = librosa.util.normalize(audio)
 
-            self.metadata[file] = self.modality_type.create_metadata(sr, audio)
+        self.metadata.append(self.modality_type.create_metadata(sr, audio))
 
-            self.data.append(audio)
+        self.data.append(audio)
 
     def get_stats(self, source_path: str):
         sampling_rate = 0
diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py
index 88decd641fd..9b89c773942 100644
--- a/src/main/python/systemds/scuro/dataloader/base_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/base_loader.py
@@ -44,9 +44,7 @@ def __init__(
         (otherwise please provide your own Dataloader that knows about the file name convention)
         """
         self.data = []
-        self.metadata = (
-            {}
-        )  # TODO: check what the index should be for storing the metadata (file_name, counter, ...)
+        self.metadata = []
         self.source_path = source_path
         self.indices = indices
         self.modality_type = modality_type
@@ -87,7 +85,7 @@ def data_type(self, data_type):
     def reset(self):
         self._next_chunk = 0
         self.data = []
-        self.metadata = {}
+        self.metadata = []
 
     def load(self):
         """
@@ -134,6 +132,7 @@ def _load_next_chunk(self):
         Loads the next chunk of data
         """
         self.data = []
+        # TODO: Handle metadata correctly
         next_chunk_indices = self.indices[
             self._next_chunk
             * self._chunk_size : (self._next_chunk + 1)
diff --git a/src/main/python/systemds/scuro/dataloader/image_loader.py b/src/main/python/systemds/scuro/dataloader/image_loader.py
index 498ae77a897..25e8690cf5a 100644
--- a/src/main/python/systemds/scuro/dataloader/image_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/image_loader.py
@@ -71,8 +71,8 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
 
         image = image.astype(np.uint8, copy=False)
 
-        self.metadata[file] = self.modality_type.create_metadata(
-            width, height, channels
+        self.metadata.append(
+            self.modality_type.create_metadata(width, height, channels)
         )
 
         self.data.append(image)
diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py
index f5ffd89ea38..adb3f6aaf64 100644
--- a/src/main/python/systemds/scuro/dataloader/json_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -69,7 +69,9 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
 
                 text = " ".join(text) if isinstance(text, list) else text
                 self.data.append(text)
-                self.metadata[idx] = self.modality_type.create_metadata(len(text), text)
+                self.metadata.append(
+                    self.modality_type.create_metadata(len(text), text) | json_file[idx]
+                )
 
     def get_stats(self, source_path: str):
         self.file_sanity_check(source_path)
diff --git a/src/main/python/systemds/scuro/dataloader/pdf_loader.py b/src/main/python/systemds/scuro/dataloader/pdf_loader.py
new file mode 100644
index 00000000000..add02e50457
--- /dev/null
+++ b/src/main/python/systemds/scuro/dataloader/pdf_loader.py
@@ -0,0 +1,70 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from typing import List, Optional, Union
+import pymupdf
+
+import numpy as np
+
+from systemds.scuro.dataloader.base_loader import BaseLoader
+import cv2
+from systemds.scuro.modality.type import ModalityType
+
+
+class PdfLoader(BaseLoader):
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        data_type: Union[np.dtype, str] = np.float16,
+        chunk_size: Optional[int] = None,
+        load=True,
+        ext=".pdf",
+    ):
+        super().__init__(
+            source_path, indices, data_type, chunk_size, ModalityType.IMAGE, ext
+        )
+        self.load_data_from_file = load
+
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
+        self.file_sanity_check(file)
+
+        doc = pymupdf.open(file)
+
+        for i, page in enumerate(doc.pages()):
+            image_bytes = page.get_pixmap().tobytes("jpg")
+            np_buffer = np.frombuffer(image_bytes, dtype=np.uint8)
+
+            image = cv2.imdecode(np_buffer, cv2.IMREAD_COLOR)
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+            if image.ndim == 2:
+                height, width = image.shape
+                channels = 1
+            else:
+                height, width, channels = image.shape
+
+            image = image.astype(np.uint8, copy=False)
+
+            self.metadata.append(
+                self.modality_type.create_metadata(width, height, channels)
+            )
+
+            self.data.append(image)
diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py
index 8b987f68453..f734a080b15 100644
--- a/src/main/python/systemds/scuro/dataloader/text_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/text_loader.py
@@ -56,8 +56,8 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
                 if self.prefix:
                     line = re.sub(self.prefix, "", line)
                 line = line.replace("\n", "")
-                self.metadata[file] = self.modality_type.create_metadata(
-                    len(line.split()), line
+                self.metadata.append(
+                    self.modality_type.create_metadata(len(line.split()), line)
                 )
                 self.data.append(line)
 
diff --git a/src/main/python/systemds/scuro/dataloader/timeseries_loader.py b/src/main/python/systemds/scuro/dataloader/timeseries_loader.py
index 6e40e8eb08e..7131b55db11 100644
--- a/src/main/python/systemds/scuro/dataloader/timeseries_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/timeseries_loader.py
@@ -81,15 +81,20 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             data = self._normalize_signals(data)
 
         if file:
-            self.metadata[index] = self.modality_type.create_metadata(
-                self.signal_names, data, self.sampling_rate
+            self.metadata.append(
+                self.modality_type.create_metadata(
+                    self.signal_names, data, self.sampling_rate
+                )
             )
+            self.data.append(data)
         else:
             for i, index in enumerate(self.indices):
-                self.metadata[str(index)] = self.modality_type.create_metadata(
-                    self.signal_names, data[i], self.sampling_rate
+                self.metadata.append(
+                    self.modality_type.create_metadata(
+                        self.signal_names, data[i], self.sampling_rate
+                    )
                 )
-        self.data.append(data)
+                self.data.append(data[i])
 
     def _normalize_signals(self, data: np.ndarray) -> np.ndarray:
         if data.ndim == 1:
diff --git a/src/main/python/systemds/scuro/dataloader/transcript_loader.py b/src/main/python/systemds/scuro/dataloader/transcript_loader.py
new file mode 100644
index 00000000000..67166f2c327
--- /dev/null
+++ b/src/main/python/systemds/scuro/dataloader/transcript_loader.py
@@ -0,0 +1,59 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from typing import List, Optional, Union
+from faster_whisper import WhisperModel
+import numpy as np
+
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.modality.type import ModalityType
+
+
+class TranscriptLoader(BaseLoader):
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        data_type: Union[np.dtype, str] = np.float32,
+        chunk_size: Optional[int] = None,
+        normalize: bool = True,
+        transcribe_model_size: str = "medium",
+        load=True,
+    ):
+        super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT)
+        self.model = WhisperModel(
+            transcribe_model_size, device="cpu", compute_type="int8"
+        )
+        self.normalize = normalize
+        self.load_data_from_file = load
+
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
+        self.file_sanity_check(file)
+        segments, _ = self.model.transcribe(file, vad_filter=True)
+
+        for i, seg in enumerate(segments):
+            md = self.modality_type.create_metadata(len(seg.text.split()), seg.text)
+            md["timestamp_start"] = seg.start
+            md["timestamp_end"] = seg.end
+            md["text"] = seg.text
+
+            self.metadata.append(md)
+
+            self.data.append(seg.text)
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
index e57f685e031..a60b7acc60b 100644
--- a/src/main/python/systemds/scuro/dataloader/video_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -87,8 +87,10 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         num_channels = 3
 
-        self.metadata[file] = self.modality_type.create_metadata(
-            self.fps, length, width, height, num_channels
+        self.metadata.append(
+            self.modality_type.create_metadata(
+                self.fps, length, width, height, num_channels
+            )
         )
 
         frames = []
diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py
index 335d1959fdc..544285bb52f 100644
--- a/src/main/python/systemds/scuro/modality/joined.py
+++ b/src/main/python/systemds/scuro/modality/joined.py
@@ -77,7 +77,8 @@ def execute(self, starting_idx=0):
             )
 
         for i in range(start, end):
-            idx_1 = list(self.left_modality.metadata.values())[i + starting_idx][
+            left_meta_idx = i if self.chunk_left else i + starting_idx
+            idx_1 = self.left_modality.metadata[left_meta_idx][
                 self.condition.leftField
             ]
             if (
@@ -90,9 +91,7 @@ def execute(self, starting_idx=0):
             if self.chunk_left:
                 i = i + starting_idx
 
-            idx_2 = list(self.right_modality.metadata.values())[i][
-                self.condition.rightField
-            ]
+            idx_2 = self.right_modality.metadata[i][self.condition.rightField]
             self.joined_right.data.append([])
 
             c = 0
@@ -228,8 +227,8 @@ def _handle_chunked_execution(self, representation):
     def _apply_representation_chunked(
         self, left_modality, right_modality, chunk_right, representation
     ):
-        new_left = Modality(left_modality.modality_type, {})
-        new_right = Modality(right_modality.modality_type, {})
+        new_left = Modality(left_modality.modality_type)
+        new_right = Modality(right_modality.modality_type)
 
         for _ in left_modality.iter_raw_data_chunks(reset=True):
             if chunk_right:
@@ -246,11 +245,11 @@ def _apply_representation_chunked(
                 self.joined_right, representation
             )
             new_right.data.extend(right_transformed.data)
-            new_right.metadata.update(right_transformed.metadata)
+            new_right.metadata.extend(right_transformed.metadata)
 
             left_transformed = self._apply_representation(left_modality, representation)
             new_left.data.extend(left_transformed.data)
-            new_left.metadata.update(left_transformed.metadata)
+            new_left.metadata.extend(left_transformed.metadata)
 
         new_left.update_metadata()
         new_right.update_metadata()
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index 1bc8180e199..477e4e45f35 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -33,7 +33,7 @@ def __init__(
         self,
         modalityType: ModalityType,
         modality_id=-1,
-        metadata={},
+        metadata=[],
         data_type=None,
         transform_time=0,
     ):
@@ -91,10 +91,10 @@ def update_metadata(self):
         ):
             return
 
-        for i, (md_k, md_v) in enumerate(self.metadata.items()):
+        for i, md_v in enumerate(self.metadata):
             md_v = selective_copy_metadata(md_v)
             updated_md = self.modality_type.update_metadata(md_v, self.data[i])
-            self.metadata[md_k] = updated_md
+            self.metadata[i] = updated_md
             if i == 0:
                 self.data_type = updated_md["data_layout"]["type"]
 
@@ -160,13 +160,10 @@ def pad(self, value=0, max_len=None):
                     if self.has_metadata():
                         attention_mask = np.zeros(maxlen, dtype=np.int8)
                         attention_mask[: len(data)] = 1
-                        md_key = list(self.metadata.keys())[i]
-                        if "attention_mask" in self.metadata[md_key]:
-                            self.metadata[md_key]["attention_mask"] = attention_mask
+                        if "attention_mask" in self.metadata[i]:
+                            self.metadata[i]["attention_mask"] = attention_mask
                         else:
-                            self.metadata[md_key].update(
-                                {"attention_mask": attention_mask}
-                            )
+                            self.metadata[i].update({"attention_mask": attention_mask})
             elif (
                 isinstance(first, list)
                 and len(first) > 0
@@ -190,13 +187,10 @@ def pad(self, value=0, max_len=None):
                     if self.has_metadata():
                         attention_mask = np.zeros(maxlen, dtype=np.int8)
                         attention_mask[: len(data)] = 1
-                        md_key = list(self.metadata.keys())[i]
-                        if "attention_mask" in self.metadata[md_key]:
-                            self.metadata[md_key]["attention_mask"] = attention_mask
+                        if "attention_mask" in self.metadata[i]:
+                            self.metadata[i]["attention_mask"] = attention_mask
                         else:
-                            self.metadata[md_key].update(
-                                {"attention_mask": attention_mask}
-                            )
+                            self.metadata[i].update({"attention_mask": attention_mask})
             else:
                 maxlen = (
                     max([len(seq) for seq in self.data]) if max_len is None else max_len
@@ -214,19 +208,16 @@ def pad(self, value=0, max_len=None):
                     if self.has_metadata():
                         attention_mask = np.zeros(result.shape[1], dtype=np.int8)
                         attention_mask[: len(data)] = 1
-                        md_key = list(self.metadata.keys())[i]
-                        if "attention_mask" in self.metadata[md_key]:
-                            self.metadata[md_key]["attention_mask"] = attention_mask
+                        if "attention_mask" in self.metadata[i]:
+                            self.metadata[i]["attention_mask"] = attention_mask
                         else:
-                            self.metadata[md_key].update(
-                                {"attention_mask": attention_mask}
-                            )
+                            self.metadata[i].update({"attention_mask": attention_mask})
         # TODO: this might need to be a new modality (otherwise we loose the original data)
         self.data = result
 
     def get_data_layout(self):
         if self.has_metadata():
-            return list(self.metadata.values())[0]["data_layout"]["representation"]
+            return self.metadata[0]["data_layout"]["representation"]
 
         return None
 
@@ -234,14 +225,14 @@ def has_data(self):
         return self.data is not None and len(self.data) != 0
 
     def has_metadata(self):
-        return self.metadata is not None and self.metadata != {}
+        return self.metadata is not None and len(self.metadata) != 0
 
     def is_aligned(self, other_modality):
         aligned = True
         for i in range(len(self.data)):
             if (
-                list(self.metadata.values())[i]["data_layout"]["shape"]
-                != list(other_modality.metadata.values())[i]["data_layout"]["shape"]
+                self.metadata[i]["data_layout"]["shape"]
+                != other_modality.metadata[i]["data_layout"]["shape"]
             ):
                 aligned = False
                 break
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
index e185deb7c97..eaaa7a20322 100644
--- a/src/main/python/systemds/scuro/modality/transformed.py
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -66,9 +66,9 @@ def __init__(
         self.aggregate_dim = aggregate_dim
 
         if modality.__class__.__name__ == "UnimodalModality":
-            for k, v in self.metadata.items():
-                if "attention_masks" in v:
-                    del self.metadata[k]["attention_masks"]
+            for m in self.metadata:
+                if "attention_masks" in m:
+                    del m["attention_masks"]
 
     def copy_from_instance(self):
         """
@@ -82,8 +82,7 @@ def calculate_memory_usage(self):
             data_bytes += self._estimate_data_bytes(instance)
 
         md_bytes = 0
-        for key, value in self.metadata.items():
-            md_bytes += self._estimate_data_bytes(key)
+        for value in self.metadata:
             md_bytes += self._estimate_data_bytes(value)
 
         total_bytes = (
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
index 9c883efec32..0493edf5bdd 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -212,7 +212,7 @@ def get_schema(self):
         return ModalitySchemas.get(self.name)
 
     def has_field(self, md, field):
-        for value in md.values():
+        for value in md:
             if field in value:
                 return True
             else:
@@ -221,7 +221,7 @@ def has_field(self, md, field):
 
     def get_field_for_instances(self, md, field):
         data = []
-        for items in md.values():
+        for items in md:
             data.append(self.get_field(items, field))
         return data
 
@@ -242,8 +242,8 @@ def add_field(self, md, field, data):
         return md
 
     def add_field_for_instances(self, md, field, data):
-        for key, value in zip(md.keys(), data):
-            md[key].update({field: value})
+        for i, value in enumerate(data):
+            md[i].update({field: value})
 
         return md
 
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index 319599d6807..84204ac570d 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -41,7 +41,7 @@ def __init__(self, data_loader: BaseLoader):
         super().__init__(
             data_loader.modality_type,
             Identifier().new_id(),
-            {},
+            [],
             data_loader.data_type,
         )
         self.data_loader = data_loader
@@ -56,10 +56,11 @@ def copy_from_instance(self):
     def get_metadata_at_position(self, position: int):
         if self.data_loader.chunk_size:
             return self.metadata[
-                self.data_loader.chunk_size * self.data_loader.next_chunk + position
+                (self.data_loader.next_chunk - 1) * self.data_loader.chunk_size
+                + position
             ]
 
-        return self.metadata[self.dataIndex][position]
+        return self.metadata[position]
 
     def get_stats(self):
         return self.stats
@@ -165,7 +166,7 @@ def apply_representations(self, representations, aggregation=None):
                     ].data.extend(transformed_chunk.data)
                     transformed_modalities_per_representation[
                         representation.name
-                    ].metadata.update(transformed_chunk.metadata)
+                    ].metadata.extend(transformed_chunk.metadata)
                     for d in transformed_chunk.data:
                         original_lengths_per_representation[representation.name].append(
                             d.shape[0]
diff --git a/src/main/python/systemds/scuro/representations/ orb_alignment.py b/src/main/python/systemds/scuro/representations/ orb_alignment.py
new file mode 100644
index 00000000000..5594f3bc54d
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/ orb_alignment.py	
@@ -0,0 +1,51 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from systemds.scuro.representations.alignment import Alignment
+from dataclasses import dataclass
+import numpy as np
+import cv2 as cv
+
+
+@dataclass
+class OrbDescriptor:
+    kp: object
+    desc: object
+
+
+class OrbAlignment(Alignment):
+    def __init__(self):
+        self.orb = cv.ORB_create()
+        self.bfm = cv.BFMatcher(cv.NORM_HAMMING, crossCheck=True)
+        super().__init("OrbAlignment")
+
+    def compute_descriptor(self, segment):
+        return [OrbDescriptor(self.orb.detectAndCompute(segment, None))]
+
+    def compare(self, a, b):
+        if a.desc is None or b.desc is None:
+            return float("inf")
+        matches = bfm.match(a.desc, b.desc)
+        good_matches = [m for m in matches if m.distance < 40]
+
+        if len(good_matches) == 0:
+            return float(inf)
+
+        return np.median([m.distance for m in good_matches])
diff --git a/src/main/python/systemds/scuro/representations/alignment.py b/src/main/python/systemds/scuro/representations/alignment.py
new file mode 100644
index 00000000000..42e6df0b205
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/alignment.py
@@ -0,0 +1,136 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
+from collections import defaultdict
+import copy
+
+
+@dataclass
+class Match:
+    primary: int
+    secondary: int
+    distance: float
+
+
+class Alignment(ABC):
+    def __init__(self, name):
+        self.name = name
+
+    def execute(self, primary_modality, secondary_modality):
+        primary_descriptor_collections = self._batch_compute_descriptors(
+            primary_modality
+        )
+        secondary_descriptor_collections = self._batch_compute_descriptors(
+            secondary_modality
+        )
+
+        matches = []
+
+        for p, p_collection in enumerate(primary_descriptor_collections):
+            stats = defaultdict(
+                lambda: {
+                    "count": 0,
+                    "total_distance": 0.0,
+                    "best_distance": float("inf"),
+                }
+            )
+
+            for p_desc in p_collection:
+                best_secondary = None
+                best_dist = float("inf")
+
+                for s, s_collection in enumerate(secondary_descriptor_collections):
+                    for s_desc in s_collection:
+                        dist = self.compare(p_desc, s_desc)
+
+                        if dist < best_dist:
+                            best_dist = dist
+                            best_secondary = s
+
+                if best_secondary is not None:
+                    stats[best_secondary]["count"] += 1
+                    stats[best_secondary]["total_distance"] += best_dist
+                    stats[best_secondary]["best_distance"] = min(
+                        stats[best_secondary]["best_distance"], best_dist
+                    )
+
+            if not stats:
+                matches.append(Match(p, None, float("inf")))
+                continue
+
+            best_match = min(
+                stats.items(),
+                key=lambda item: (
+                    -item[1]["count"],  # mehr Votes ist besser
+                    item[1]["total_distance"],  # kleinere Gesamtdistanz ist besser
+                    item[1]["best_distance"],  # optional weiterer Tie-Breaker
+                ),
+            )[0]
+
+            result_distance = (
+                stats[best_match]["total_distance"] / stats[best_match]["count"]
+            )
+
+            matches.append(Match(p, best_match, result_distance))
+
+        return matches
+
+    @staticmethod
+    def apply_matching(alignment, secondary_modality):
+        aligned_modality = copy.deepcopy(secondary_modality)
+        aligned_modality.data = [None] * len(alignment)
+        aligned_modality.metadata = [None] * len(alignment)
+
+        for match in alignment:
+            aligned_modality.data[match.primary] = secondary_modality.data[
+                match.secondary
+            ]
+            aligned_modality.metadata[match.primary] = secondary_modality.metadata[
+                match.secondary
+            ]
+
+        return aligned_modality
+
+    def _batch_compute_descriptors(self, modality):
+        descriptors = []
+
+        if modality.data_loader.chunk_size:
+            modality.data_loader.reset()
+            while modality.data_loader.next_chunk < modality.data_loader.num_chunks:
+                modality.extract_raw_data()
+                for d in modality.data:
+                    descriptors.append(self.compute_descriptor(d))
+        else:
+            if not modality.has_data():
+                modality.extract_raw_data()
+                for d in modality.data:
+                    descriptors.append(self.compute_descriptor(d))
+
+        return descriptors
+
+    @abstractmethod
+    def compute_descriptor(self, segment):
+        pass
+
+    @abstractmethod
+    def compare(self, a, b):
+        pass
diff --git a/src/main/python/systemds/scuro/representations/clip.py b/src/main/python/systemds/scuro/representations/clip.py
index b13d2dfeb85..518cc1eb5dc 100644
--- a/src/main/python/systemds/scuro/representations/clip.py
+++ b/src/main/python/systemds/scuro/representations/clip.py
@@ -243,6 +243,9 @@ def create_visual_embeddings(self, modality):
                 with torch.no_grad():
                     output = self.model.get_image_features(**inputs)
 
+                if hasattr(output, "pooler_output"):
+                    output = output.pooler_output
+
                 if len(output.shape) > 2:
                     output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1))
 
diff --git a/src/main/python/systemds/scuro/representations/concatenation.py b/src/main/python/systemds/scuro/representations/concatenation.py
index a7ca905f477..5d53690317e 100644
--- a/src/main/python/systemds/scuro/representations/concatenation.py
+++ b/src/main/python/systemds/scuro/representations/concatenation.py
@@ -44,9 +44,7 @@ def execute(self, modalities: List[Modality]):
         if len(modalities) == 1:
             return np.asarray(
                 modalities[0].data,
-                dtype=modalities[0].metadata[list(modalities[0].metadata.keys())[0]][
-                    "data_layout"
-                ]["type"],
+                dtype=modalities[0].metadata[0]["data_layout"]["type"],
             )
 
         max_emb_size = self.get_max_embedding_size(modalities)
@@ -64,9 +62,7 @@ def execute(self, modalities: List[Modality]):
                     data,
                     np.asarray(
                         other_modality,
-                        dtype=modality.metadata[list(modality.metadata.keys())[0]][
-                            "data_layout"
-                        ]["type"],
+                        dtype=modality.metadata[0]["data_layout"]["type"],
                     ),
                 ],
                 axis=-1,
diff --git a/src/main/python/systemds/scuro/representations/contrastive_learning.py b/src/main/python/systemds/scuro/representations/contrastive_learning.py
new file mode 100644
index 00000000000..3599079e6f7
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/contrastive_learning.py
@@ -0,0 +1,81 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import copy
+
+
+class ContrasitveLearning:
+    @staticmethod
+    def execute(
+        input_first_modality,
+        input_second_modality,
+        input_first_extensions,
+        input_second_extensions,
+        metadata_matching_function,
+    ):
+        # Add check for same dimensionality of input modlities and extensions
+
+        def empty_modality_copy(input_modality):
+            modality = copy.deepcopy(input_modality)
+            if isinstance(modality, list):
+                for m in modality:
+                    m.data = []
+                    m.metadata = []
+            else:
+                modality.data = []
+                modality.metadata = []
+
+            return modality
+
+        first_modality = empty_modality_copy(input_first_modality)
+        second_modality = empty_modality_copy(input_second_modality)
+        first_extensions = empty_modality_copy(input_first_extensions)
+        second_extensions = empty_modality_copy(input_second_extensions)
+
+        labels = []
+
+        for i in range(len(input_first_modality.data)):
+            for j in range(len(input_second_modality.data)):
+                first_modality.data.append(input_first_modality.data[i])
+                first_modality.metadata.append(input_first_modality.metadata[i])
+
+                for m, input_m in zip(first_extensions, input_first_extensions):
+                    m.data.append(input_m.data[i])
+                    m.metadata.append(input_m.metadata[i])
+
+                second_modality.data.append(input_second_modality.data[j])
+                second_modality.metadata.append(input_second_modality.metadata[j])
+
+                for m, input_m in zip(second_extensions, input_second_extensions):
+                    m.data.append(input_m.data[j])
+                    m.metadata.append(input_m.metadata[j])
+
+                if metadata_matching_function(
+                    input_first_modality.metadata[i], input_second_modality.metadata[j]
+                ):
+                    labels.append(True)
+                else:
+                    labels.append(False)
+
+        return (
+            [first_modality] + first_extensions,
+            [second_modality] + second_extensions,
+            labels,
+        )
diff --git a/src/main/python/systemds/scuro/representations/covarep_audio_features.py b/src/main/python/systemds/scuro/representations/covarep_audio_features.py
index 28dd6fc2978..01098ef4ee1 100644
--- a/src/main/python/systemds/scuro/representations/covarep_audio_features.py
+++ b/src/main/python/systemds/scuro/representations/covarep_audio_features.py
@@ -47,7 +47,7 @@ def transform(self, modality, aggregation=None):
         )
         result = []
         for i, y in enumerate(modality.data):
-            sr = list(modality.metadata.values())[i]["frequency"]
+            sr = modality.metadata[i]["frequency"]
 
             spectral_centroid = librosa.feature.spectral_centroid(
                 y=y, sr=sr, hop_length=self.hop_length
@@ -222,7 +222,7 @@ def transform(self, modality, aggregation=None):
         )
         result = []
         for i, y in enumerate(modality.data):
-            sr = list(modality.metadata.values())[i]["frequency"]
+            sr = modality.metadata[i]["frequency"]
 
             pitches, magnitudes = librosa.piptrack(
                 y=y, sr=sr, hop_length=self.hop_length
diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py
index e067e2ccf95..104f2727e69 100644
--- a/src/main/python/systemds/scuro/representations/lstm.py
+++ b/src/main/python/systemds/scuro/representations/lstm.py
@@ -92,7 +92,7 @@ def _prepare_data(self, modalities: List[Modality]) -> np.ndarray:
                 data = np.array(modality.data)
             except:
                 max_len = -1
-                for md in modality.metadata.values():
+                for md in modality.metadata:
                     if max_len < md["data_layout"]["shape"][0]:
                         max_len = md["data_layout"]["shape"][0]
                 data = np.zeros((len(modality.data), max_len))
diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 67ff955a98f..46e5045b2eb 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -57,10 +57,9 @@ def transform(self, modality, aggregation=None):
             modality, self, self.output_modality_type
         )
         result = []
-        metadata_values = list(modality.metadata.values())
 
         for i, sample in enumerate(modality.data):
-            sr = metadata_values[i]["frequency"]
+            sr = modality.metadata[i]["frequency"]
             computed_feature = self.compute_feature(sample, sr)
             result.append(computed_feature)
 
diff --git a/src/main/python/systemds/scuro/representations/mfcc.py b/src/main/python/systemds/scuro/representations/mfcc.py
index 5e5b54b0795..737a3dffe95 100644
--- a/src/main/python/systemds/scuro/representations/mfcc.py
+++ b/src/main/python/systemds/scuro/representations/mfcc.py
@@ -63,7 +63,7 @@ def transform(self, modality, aggregation=None):
         result = []
 
         for i, sample in enumerate(modality.data):
-            sr = list(modality.metadata.values())[i]["frequency"]
+            sr = modality.metadata[i]["frequency"]
             computed_feature = self.compute_feature(sample, sr)
             result.append(computed_feature)
 
diff --git a/src/main/python/systemds/scuro/representations/pHash_alignment.py b/src/main/python/systemds/scuro/representations/pHash_alignment.py
new file mode 100644
index 00000000000..f70a12a770f
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/pHash_alignment.py
@@ -0,0 +1,43 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from systemds.scuro.representations.alignment import Alignment
+import cv2 as cv
+import numpy as np
+
+
+class PHashAlignment(Alignment):
+    def __init__(self):
+        super().__init__("pHashAlignment")
+        self.hasher = cv.img_hash.PHash_create()
+
+    def compute_descriptor(self, segment):
+        if segment.ndim == 3:
+            return [self.hasher.compute(segment)]
+        if segment.ndim == 4:  # For videos
+            descriptors = []
+            for s in segment:
+                frame = (s * 255).astype(np.uint8, copy=True)
+                descriptors.append(self.hasher.compute(frame))
+            return descriptors
+        raise ("PHashAlignment is only implemented for ndim=3 or ndim=4")
+
+    def compare(self, a, b):
+        return self.hasher.compare(a, b)
diff --git a/src/main/python/systemds/scuro/representations/sum.py b/src/main/python/systemds/scuro/representations/sum.py
index e6187b4050d..d6c4fe659b2 100644
--- a/src/main/python/systemds/scuro/representations/sum.py
+++ b/src/main/python/systemds/scuro/representations/sum.py
@@ -43,17 +43,13 @@ def __init__(self, params=None):
     def execute(self, modalities: List[Modality]):
         data = np.asarray(
             modalities[0].data,
-            dtype=modalities[0].metadata[list(modalities[0].metadata.keys())[0]][
-                "data_layout"
-            ]["type"],
+            dtype=modalities[0].metadata[0]["data_layout"]["type"],
         )
 
         for m in range(1, len(modalities)):
             data += np.asarray(
                 modalities[m].data,
-                dtype=modalities[m].metadata[list(modalities[m].metadata.keys())[0]][
-                    "data_layout"
-                ]["type"],
+                dtype=modalities[m].metadata[0]["data_layout"]["type"],
             )
         return data
 
diff --git a/src/main/python/systemds/scuro/representations/text_context_with_indices.py b/src/main/python/systemds/scuro/representations/text_context_with_indices.py
index 095e9f81063..1a341af1e3a 100644
--- a/src/main/python/systemds/scuro/representations/text_context_with_indices.py
+++ b/src/main/python/systemds/scuro/representations/text_context_with_indices.py
@@ -208,7 +208,7 @@ def execute(self, modality):
             List of lists, where each inner list contains text chunks (strings)
         """
 
-        for instance, metadata in zip(modality.data, modality.metadata.values()):
+        for instance, metadata in zip(modality.data, modality.metadata):
             text = _extract_text(instance)
             if not text:
                 ModalityType.TEXT.add_field(metadata, "text_spans", [(0, 0)])
@@ -344,7 +344,7 @@ def execute(self, modality):
             List of tuples, where each tuple contains start and end index to the text chunks
         """
 
-        for instance, metadata in zip(modality.data, modality.metadata.values()):
+        for instance, metadata in zip(modality.data, modality.metadata):
             text = _extract_text(instance)
             if not text:
                 ModalityType.TEXT.add_field(metadata, "text_spans", [(0, 0)])
diff --git a/src/main/python/systemds/scuro/representations/timeseries_representations.py b/src/main/python/systemds/scuro/representations/timeseries_representations.py
index bb4ea4f49cb..80f6880a0b8 100644
--- a/src/main/python/systemds/scuro/representations/timeseries_representations.py
+++ b/src/main/python/systemds/scuro/representations/timeseries_representations.py
@@ -52,7 +52,7 @@ def transform(self, modality, aggregation=None):
             result.append(feature)
 
         transformed_modality.data = np.vstack(np.array(result)).astype(
-            modality.metadata[list(modality.metadata.keys())[0]]["data_layout"]["type"]
+            modality.metadata[0]["data_layout"]["type"]
         )
         return transformed_modality
 
diff --git a/src/main/python/systemds/scuro/representations/wav2vec.py b/src/main/python/systemds/scuro/representations/wav2vec.py
index c599550be19..38dcb848436 100644
--- a/src/main/python/systemds/scuro/representations/wav2vec.py
+++ b/src/main/python/systemds/scuro/representations/wav2vec.py
@@ -52,7 +52,7 @@ def transform(self, modality, aggregation=None):
 
         result = []
         for i, sample in enumerate(modality.data):
-            sr = list(modality.metadata.values())[i]["frequency"]
+            sr = modality.metadata[i]["frequency"]
             audio_resampled = librosa.resample(
                 np.array(sample), orig_sr=sr, target_sr=16000
             )
diff --git a/src/main/python/systemds/scuro/representations/window_aggregation.py b/src/main/python/systemds/scuro/representations/window_aggregation.py
index c9622f1a2ad..a34b6ebe4c8 100644
--- a/src/main/python/systemds/scuro/representations/window_aggregation.py
+++ b/src/main/python/systemds/scuro/representations/window_aggregation.py
@@ -224,7 +224,7 @@ def execute(self, modality):
             )
 
             windowed_data = np.array(padded_features)
-            data_type = list(modality.metadata.values())[0]["data_layout"]["type"]
+            data_type = modality.metadata[0]["data_layout"]["type"]
             if data_type != "str":
                 windowed_data = windowed_data.astype(data_type)
 
diff --git a/src/main/python/systemds/scuro/utils/schema_helpers.py b/src/main/python/systemds/scuro/utils/schema_helpers.py
index 3d1fbf4d71a..929e9c7f4f3 100644
--- a/src/main/python/systemds/scuro/utils/schema_helpers.py
+++ b/src/main/python/systemds/scuro/utils/schema_helpers.py
@@ -43,4 +43,4 @@ def calculate_new_frequency(new_length, old_length, old_frequency):
 
 
 def get_shape(metadata):
-    return len(list(metadata.values())[0]["data_layout"]["shape"])
+    return len(metadata[0]["data_layout"]["shape"])
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index b78ea314833..a0b43fc8593 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -51,6 +51,7 @@ class TestDataLoader(BaseLoader):
     def __init__(self, indices, chunk_size, modality_type, data, data_type, metadata):
         super().__init__("", indices, data_type, chunk_size, modality_type)
 
+        self._full_metadata = metadata
         self.metadata = metadata
         self.test_data = data
         if modality_type == ModalityType.TEXT:
@@ -110,8 +111,10 @@ def reset(self):
     def extract(self, file, indices):
         if isinstance(self.test_data, list):
             self.data = [self.test_data[i] for i in indices]
+            self.metadata = [self._full_metadata[i] for i in indices]
         else:
             self.data = self.test_data[indices]
+            self.metadata = [self._full_metadata[i] for i in indices]
 
 
 class ModalityRandomDataGenerator:
@@ -120,7 +123,7 @@ def __init__(self):
         np.random.seed(4)
         self.modality_id = 0
         self.modality_type = None
-        self.metadata = {}
+        self.metadata = []
         self.data_type = np.float32
         self.transform_time = 0
         self.stats = None
@@ -186,21 +189,22 @@ def create1DModality(
 
         # TODO: write a dummy method to create the same metadata for all instances to avoid the for loop
         self.modality_type = modality_type
+        self.metadata = []
         for i in range(num_instances):
             if modality_type == ModalityType.AUDIO:
-                self.metadata[i] = modality_type.create_metadata(
+                self.metadata.append(modality_type.create_metadata(
                     num_features / 10, data[i]
-                )
+                ))
             elif modality_type == ModalityType.TEXT:
-                self.metadata[i] = modality_type.create_metadata(
+                self.metadata.append(modality_type.create_metadata(
                     num_features / 10, data[i]
-                )
+                ))
             elif modality_type == ModalityType.VIDEO:
-                self.metadata[i] = modality_type.create_metadata(
+                self.metadata.append(modality_type.create_metadata(
                     num_features / 30, 10, 0, 0, 1
-                )
+                ))
             elif modality_type == ModalityType.TIMESERIES:
-                self.metadata[i] = modality_type.create_metadata(["test"], data[i])
+                self.metadata.append(modality_type.create_metadata(["test"], data[i]))
             else:
                 raise NotImplementedError
 
@@ -222,10 +226,10 @@ def create_audio_data(self, num_instances, max_audio_length):
         for i in range(num_instances):
             data[i] = np.array(data[i]).astype(self.data_type)
 
-        self.metadata = {
-            i: self.modality_type.create_metadata(16000, np.array(data[i]))
+        self.metadata = [
+            self.modality_type.create_metadata(16000, np.array(data[i]))
             for i in range(num_instances)
-        }
+        ]
 
         return data, self.metadata
 
@@ -237,12 +241,12 @@ def create_timeseries_data(self, num_instances, sequence_length, num_features=1)
         ]
         if num_features == 1:
             data = [d.squeeze(-1) for d in data]
-        self.metadata = {
-            i: self.modality_type.create_metadata(
+        self.metadata = [
+            self.modality_type.create_metadata(
                 [f"feature_{j}" for j in range(num_features)], data[i]
             )
             for i in range(num_instances)
-        }
+        ]
         return data, self.metadata
 
     def create_text_data(self, num_instances, num_sentences_per_instance=1):
@@ -308,10 +312,10 @@ def create_text_data(self, num_instances, num_sentences_per_instance=1):
                 sentence += f" {verb} {obj}{punct}"
             sentences.append(sentence)
 
-        self.metadata = {
-            i: self.modality_type.create_metadata(len(sentences[i]), sentences[i])
+        self.metadata = [
+            self.modality_type.create_metadata(len(sentences[i]), sentences[i])
             for i in range(num_instances)
-        }
+        ]
 
         return sentences, self.metadata
 
@@ -321,9 +325,9 @@ def create_3d_modality(self, num_instances, dims=(100, 28, 28)):
             np.random.rand(dims[0], dims[1], dims[2]).astype(self.data_type)
             for _ in range(num_instances)
         ]
-        self.metadata = {
-            i: self.modality_type.create_metadata(data[i]) for i in range(num_instances)
-        }
+        self.metadata = [
+            self.modality_type.create_metadata(data[i]) for i in range(num_instances)
+        ]
         return data, self.metadata
 
     def create_2d_modality(self, num_instances, dims=(100, 28)):
@@ -332,9 +336,9 @@ def create_2d_modality(self, num_instances, dims=(100, 28)):
             np.random.rand(dims[0], dims[1]).astype(self.data_type)
             for _ in range(num_instances)
         ]
-        self.metadata = {
-            i: self.modality_type.create_metadata(data[i]) for i in range(num_instances)
-        }
+        self.metadata = [
+            self.modality_type.create_metadata(data[i]) for i in range(num_instances)
+        ]
         return data, self.metadata
 
     def create_visual_modality(
@@ -356,12 +360,12 @@ def create_visual_modality(
                 for _ in range(num_instances)
             ]
 
-            self.metadata = {
-                i: self.modality_type.create_metadata(
+            self.metadata = [
+                self.modality_type.create_metadata(
                     30, data[i].shape[0], width, height, color_channels
                 )
                 for i in range(num_instances)
-            }
+            ]
         else:
             self.modality_type = ModalityType.IMAGE
             data = [
@@ -373,10 +377,10 @@ def create_visual_modality(
                 )
                 for _ in range(num_instances)
             ]
-            self.metadata = {
-                i: self.modality_type.create_metadata(width, height, color_channels)
+            self.metadata = [
+                self.modality_type.create_metadata(width, height, color_channels)
                 for i in range(num_instances)
-            }
+            ]
 
         return data, self.metadata
 
diff --git a/src/main/python/tests/scuro/test_text_context_operators.py b/src/main/python/tests/scuro/test_text_context_operators.py
index ffa702b7c82..0b8ad36c160 100644
--- a/src/main/python/tests/scuro/test_text_context_operators.py
+++ b/src/main/python/tests/scuro/test_text_context_operators.py
@@ -83,7 +83,7 @@ def test_sentence_boundary_split_indices(self):
         sentence_boundary_split = SentenceBoundarySplitIndices(10, min_words=4)
         sentence_boundary_split.execute(self.text_modality)
         for instance, md in zip(
-            self.text_modality.data, self.text_modality.metadata.values()
+            self.text_modality.data, self.text_modality.metadata
         ):
             for chunk in md["text_spans"]:
                 text = instance[chunk[0] : chunk[1]].split(" ")
@@ -95,7 +95,7 @@ def test_overlapping_split_indices(self):
         overlapping_split = OverlappingSplitIndices(40, 0.1)
         overlapping_split.execute(self.text_modality)
         for instance, md in zip(
-            self.text_modality.data, self.text_modality.metadata.values()
+            self.text_modality.data, self.text_modality.metadata
         ):
             prev_chunk = (0, 0)
             for j, chunk in enumerate(md["text_spans"]):