apache · b-enedict · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026 · Apr 23, 2026
diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -63,18 +63,17 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         if not self.load_data_from_file:
             import numpy as np
 
-            self.metadata[file] = self.modality_type.create_metadata(
-                1000, np.array([0])
-            )
+            audio = np.array([0])
+            sr = 1000
         else:
             audio, sr = librosa.load(file, dtype=self._data_type)
 
             if self.normalize:
                 audio = librosa.util.normalize(audio)
 
-            self.metadata[file] = self.modality_type.create_metadata(sr, audio)
+        self.metadata.append(self.modality_type.create_metadata(sr, audio))
 
-            self.data.append(audio)
+        self.data.append(audio)
 
     def get_stats(self, source_path: str):
         sampling_rate = 0

diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py
@@ -44,9 +44,7 @@ def __init__(
         (otherwise please provide your own Dataloader that knows about the file name convention)
         """
         self.data = []
-        self.metadata = (
-            {}
-        )  # TODO: check what the index should be for storing the metadata (file_name, counter, ...)
+        self.metadata = []
         self.source_path = source_path
         self.indices = indices
         self.modality_type = modality_type
@@ -87,7 +85,7 @@ def data_type(self, data_type):
     def reset(self):
         self._next_chunk = 0
         self.data = []
-        self.metadata = {}
+        self.metadata = []
 
     def load(self):
         """
@@ -134,6 +132,7 @@ def _load_next_chunk(self):
         Loads the next chunk of data
         """
         self.data = []
+        # TODO: Handle metadata correctly
         next_chunk_indices = self.indices[
             self._next_chunk
             * self._chunk_size : (self._next_chunk + 1)

diff --git a/src/main/python/systemds/scuro/dataloader/image_loader.py b/src/main/python/systemds/scuro/dataloader/image_loader.py
@@ -71,8 +71,8 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
 
         image = image.astype(np.uint8, copy=False)
 
-        self.metadata[file] = self.modality_type.create_metadata(
-            width, height, channels
+        self.metadata.append(
+            self.modality_type.create_metadata(width, height, channels)
         )
 
         self.data.append(image)

diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -69,7 +69,9 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
 
                 text = " ".join(text) if isinstance(text, list) else text
                 self.data.append(text)
-                self.metadata[idx] = self.modality_type.create_metadata(len(text), text)
+                self.metadata.append(
+                    self.modality_type.create_metadata(len(text), text) | json_file[idx]
+                )
 
     def get_stats(self, source_path: str):
         self.file_sanity_check(source_path)

diff --git a/src/main/python/systemds/scuro/dataloader/pdf_loader.py b/src/main/python/systemds/scuro/dataloader/pdf_loader.py
@@ -0,0 +1,92 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from typing import List, Optional, Union
+
+import cv2
+import numpy as np
+import pymupdf
+
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.modality.type import ModalityType
+
+
+class PdfLoader(BaseLoader):
+    """
+    Dataloader that turns every page of a PDF document into an RGB image.
+
+    Each rendered page is appended to ``self.data`` and a matching image
+    metadata entry is appended to ``self.metadata``, so a single file can
+    contribute several instances.
+    """
+
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        data_type: Union[np.dtype, str] = np.float16,
+        chunk_size: Optional[int] = None,
+        load=True,
+        ext=".pdf",
+    ):
+        """
+        :param source_path: Forwarded to BaseLoader as the data location
+        :param indices: Forwarded to BaseLoader as the instance identifiers
+        :param data_type: Forwarded to BaseLoader; extracted pages are always uint8
+        :param chunk_size: Forwarded to BaseLoader
+        :param load: Stored as load_data_from_file; not read by this loader
+        :param ext: File extension forwarded to BaseLoader
+        """
+        super().__init__(
+            source_path, indices, data_type, chunk_size, ModalityType.IMAGE, ext
+        )
+        self.load_data_from_file = load
+
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
+        """
+        Renders every page of the PDF and stores it together with its metadata.
+
+        :param file: Path of the PDF file to read
+        :param index: Not used by this loader
+        :raises ValueError: If a rendered page cannot be decoded into an image
+        """
+        self.file_sanity_check(file)
+
+        # The context manager closes the document even if a page fails to render.
+        with pymupdf.open(file) as doc:
+            for page_number, page in enumerate(doc.pages()):
+                image_bytes = page.get_pixmap().tobytes("jpg")
+                np_buffer = np.frombuffer(image_bytes, dtype=np.uint8)
+
+                image = cv2.imdecode(np_buffer, cv2.IMREAD_COLOR)
+                if image is None:
+                    raise ValueError(
+                        f"Could not decode page {page_number} of '{file}'"
+                    )
+
+                # IMREAD_COLOR always yields a 3-channel uint8 BGR array, so no
+                # grayscale branch or dtype cast is needed after the conversion.
+                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+                height, width, channels = image.shape
+
+                self.metadata.append(
+                    self.modality_type.create_metadata(width, height, channels)
+                )
+                self.data.append(image)
diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py
@@ -56,8 +56,8 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
                 if self.prefix:
                     line = re.sub(self.prefix, "", line)
                 line = line.replace("\n", "")
-                self.metadata[file] = self.modality_type.create_metadata(
-                    len(line.split()), line
+                self.metadata.append(
+                    self.modality_type.create_metadata(len(line.split()), line)
                 )
                 self.data.append(line)
 

diff --git a/src/main/python/systemds/scuro/dataloader/timeseries_loader.py b/src/main/python/systemds/scuro/dataloader/timeseries_loader.py
@@ -81,15 +81,20 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             data = self._normalize_signals(data)
 
         if file:
-            self.metadata[index] = self.modality_type.create_metadata(
-                self.signal_names, data, self.sampling_rate
+            self.metadata.append(
+                self.modality_type.create_metadata(
+                    self.signal_names, data, self.sampling_rate
+                )
             )
+            self.data.append(data)
         else:
             for i, index in enumerate(self.indices):
-                self.metadata[str(index)] = self.modality_type.create_metadata(
-                    self.signal_names, data[i], self.sampling_rate
+                self.metadata.append(
+                    self.modality_type.create_metadata(
+                        self.signal_names, data[i], self.sampling_rate
+                    )
                 )
-        self.data.append(data)
+                self.data.append(data[i])
 
     def _normalize_signals(self, data: np.ndarray) -> np.ndarray:
         if data.ndim == 1:

diff --git a/src/main/python/systemds/scuro/dataloader/transcript_loader.py b/src/main/python/systemds/scuro/dataloader/transcript_loader.py
@@ -0,0 +1,84 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from typing import List, Optional, Union
+
+import numpy as np
+from faster_whisper import WhisperModel
+
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.modality.type import ModalityType
+
+
+class TranscriptLoader(BaseLoader):
+    """
+    Dataloader that transcribes files with a faster-whisper model.
+
+    Every transcribed segment becomes one text instance in ``self.data`` with
+    a metadata entry that also carries the segment's start and end timestamps.
+    """
+
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        data_type: Union[np.dtype, str] = np.float32,
+        chunk_size: Optional[int] = None,
+        normalize: bool = True,
+        transcribe_model_size: str = "medium",
+        load=True,
+    ):
+        """
+        :param source_path: Forwarded to BaseLoader as the data location
+        :param indices: Forwarded to BaseLoader as the instance identifiers
+        :param data_type: Forwarded to BaseLoader
+        :param chunk_size: Forwarded to BaseLoader
+        :param normalize: Stored on the loader; not read by extract
+        :param transcribe_model_size: Model size passed to WhisperModel
+        :param load: Stored as load_data_from_file; not read by extract
+        """
+        super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT)
+        # The model is created once per loader, configured for int8 on the CPU.
+        self.model = WhisperModel(
+            transcribe_model_size, device="cpu", compute_type="int8"
+        )
+        self.normalize = normalize
+        self.load_data_from_file = load
+
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
+        """
+        Transcribes the file and stores one text instance per segment.
+
+        :param file: Path of the file to transcribe
+        :param index: Not used by this loader
+        """
+        self.file_sanity_check(file)
+        # vad_filter asks faster-whisper to drop non-speech parts of the input.
+        segments, _ = self.model.transcribe(file, vad_filter=True)
+
+        for seg in segments:
+            text = seg.text
+            md = self.modality_type.create_metadata(len(text.split()), text)
+            md["timestamp_start"] = seg.start
+            md["timestamp_end"] = seg.end
+            md["text"] = text
+
+            self.metadata.append(md)
+            self.data.append(text)
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -87,8 +87,10 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         num_channels = 3
 
-        self.metadata[file] = self.modality_type.create_metadata(
-            self.fps, length, width, height, num_channels
+        self.metadata.append(
+            self.modality_type.create_metadata(
+                self.fps, length, width, height, num_channels
+            )
         )
 
         frames = []

diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py
@@ -77,7 +77,8 @@ def execute(self, starting_idx=0):
             )
 
         for i in range(start, end):
-            idx_1 = list(self.left_modality.metadata.values())[i + starting_idx][
+            left_meta_idx = i if self.chunk_left else i + starting_idx
+            idx_1 = self.left_modality.metadata[left_meta_idx][
                 self.condition.leftField
             ]
             if (
@@ -90,9 +91,7 @@ def execute(self, starting_idx=0):
             if self.chunk_left:
                 i = i + starting_idx
 
-            idx_2 = list(self.right_modality.metadata.values())[i][
-                self.condition.rightField
-            ]
+            idx_2 = self.right_modality.metadata[i][self.condition.rightField]
             self.joined_right.data.append([])
 
             c = 0
@@ -228,8 +227,8 @@ def _handle_chunked_execution(self, representation):
     def _apply_representation_chunked(
         self, left_modality, right_modality, chunk_right, representation
     ):
-        new_left = Modality(left_modality.modality_type, {})
-        new_right = Modality(right_modality.modality_type, {})
+        new_left = Modality(left_modality.modality_type)
+        new_right = Modality(right_modality.modality_type)
 
         for _ in left_modality.iter_raw_data_chunks(reset=True):
             if chunk_right:
@@ -246,11 +245,11 @@ def _apply_representation_chunked(
                 self.joined_right, representation
             )
             new_right.data.extend(right_transformed.data)
-            new_right.metadata.update(right_transformed.metadata)
+            new_right.metadata.extend(right_transformed.metadata)
 
             left_transformed = self._apply_representation(left_modality, representation)
             new_left.data.extend(left_transformed.data)
-            new_left.metadata.update(left_transformed.metadata)
+            new_left.metadata.extend(left_transformed.metadata)
 
         new_left.update_metadata()
         new_right.update_metadata()

diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
@@ -33,7 +33,7 @@ def __init__(
         self,
         modalityType: ModalityType,
         modality_id=-1,
-        metadata={},
+        metadata=[],
         data_type=None,
         transform_time=0,
     ):
@@ -91,10 +91,10 @@ def update_metadata(self):
         ):
             return
 
-        for i, (md_k, md_v) in enumerate(self.metadata.items()):
+        for i, md_v in enumerate(self.metadata):
             md_v = selective_copy_metadata(md_v)
             updated_md = self.modality_type.update_metadata(md_v, self.data[i])
-            self.metadata[md_k] = updated_md
+            self.metadata[i] = updated_md
             if i == 0:
                 self.data_type = updated_md["data_layout"]["type"]
 
@@ -160,13 +160,10 @@ def pad(self, value=0, max_len=None):
                     if self.has_metadata():
                         attention_mask = np.zeros(maxlen, dtype=np.int8)
                         attention_mask[: len(data)] = 1
-                        md_key = list(self.metadata.keys())[i]
-                        if "attention_mask" in self.metadata[md_key]:
-                            self.metadata[md_key]["attention_mask"] = attention_mask
+                        if "attention_mask" in self.metadata[i]:
+                            self.metadata[i]["attention_mask"] = attention_mask
                         else:
-                            self.metadata[md_key].update(
-                                {"attention_mask": attention_mask}
-                            )
+                            self.metadata[i].update({"attention_mask": attention_mask})
             elif (
                 isinstance(first, list)
                 and len(first) > 0
@@ -190,13 +187,10 @@ def pad(self, value=0, max_len=None):
                     if self.has_metadata():
                         attention_mask = np.zeros(maxlen, dtype=np.int8)
                         attention_mask[: len(data)] = 1
-                        md_key = list(self.metadata.keys())[i]
-                        if "attention_mask" in self.metadata[md_key]:
-                            self.metadata[md_key]["attention_mask"] = attention_mask
+                        if "attention_mask" in self.metadata[i]:
+                            self.metadata[i]["attention_mask"] = attention_mask
                         else:
-                            self.metadata[md_key].update(
-                                {"attention_mask": attention_mask}
-                            )
+                            self.metadata[i].update({"attention_mask": attention_mask})
             else:
                 maxlen = (
                     max([len(seq) for seq in self.data]) if max_len is None else max_len
@@ -214,34 +208,31 @@ def pad(self, value=0, max_len=None):
                     if self.has_metadata():
                         attention_mask = np.zeros(result.shape[1], dtype=np.int8)
                         attention_mask[: len(data)] = 1
-                        md_key = list(self.metadata.keys())[i]
-                        if "attention_mask" in self.metadata[md_key]:
-                            self.metadata[md_key]["attention_mask"] = attention_mask
+                        if "attention_mask" in self.metadata[i]:
+                            self.metadata[i]["attention_mask"] = attention_mask
                         else:
-                            self.metadata[md_key].update(
-                                {"attention_mask": attention_mask}
-                            )
+                            self.metadata[i].update({"attention_mask": attention_mask})
         # TODO: this might need to be a new modality (otherwise we loose the original data)
         self.data = result
 
     def get_data_layout(self):
         if self.has_metadata():
-            return list(self.metadata.values())[0]["data_layout"]["representation"]
+            return self.metadata[0]["data_layout"]["representation"]
 
         return None
 
     def has_data(self):
         return self.data is not None and len(self.data) != 0
 
     def has_metadata(self):
-        return self.metadata is not None and self.metadata != {}
+        return self.metadata is not None and len(self.metadata) != 0
 
     def is_aligned(self, other_modality):
         aligned = True
         for i in range(len(self.data)):
             if (
-                list(self.metadata.values())[i]["data_layout"]["shape"]
-                != list(other_modality.metadata.values())[i]["data_layout"]["shape"]
+                self.metadata[i]["data_layout"]["shape"]
+                != other_modality.metadata[i]["data_layout"]["shape"]
             ):
                 aligned = False
                 break