diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index e0b9e61a070..1c7ae4f3a85 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -63,18 +63,17 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): if not self.load_data_from_file: import numpy as np - self.metadata[file] = self.modality_type.create_metadata( - 1000, np.array([0]) - ) + audio = np.array([0]) + sr = 1000 else: audio, sr = librosa.load(file, dtype=self._data_type) if self.normalize: audio = librosa.util.normalize(audio) - self.metadata[file] = self.modality_type.create_metadata(sr, audio) + self.metadata.append(self.modality_type.create_metadata(sr, audio)) - self.data.append(audio) + self.data.append(audio) def get_stats(self, source_path: str): sampling_rate = 0 diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index 88decd641fd..9b89c773942 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -44,9 +44,7 @@ def __init__( (otherwise please provide your own Dataloader that knows about the file name convention) """ self.data = [] - self.metadata = ( - {} - ) # TODO: check what the index should be for storing the metadata (file_name, counter, ...) + self.metadata = [] self.source_path = source_path self.indices = indices self.modality_type = modality_type @@ -87,7 +85,7 @@ def data_type(self, data_type): def reset(self): self._next_chunk = 0 self.data = [] - self.metadata = {} + self.metadata = [] def load(self): """ @@ -134,6 +132,7 @@ def _load_next_chunk(self): Loads the next chunk of data """ self.data = [] + # TODO: Handle metadata correctly next_chunk_indices = self.indices[ self._next_chunk * self._chunk_size : (self._next_chunk + 1) diff --git a/src/main/python/systemds/scuro/dataloader/image_loader.py b/src/main/python/systemds/scuro/dataloader/image_loader.py index 498ae77a897..25e8690cf5a 100644 --- a/src/main/python/systemds/scuro/dataloader/image_loader.py +++ b/src/main/python/systemds/scuro/dataloader/image_loader.py @@ -71,8 +71,8 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): image = image.astype(np.uint8, copy=False) - self.metadata[file] = self.modality_type.create_metadata( - width, height, channels + self.metadata.append( + self.modality_type.create_metadata(width, height, channels) ) self.data.append(image) diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py index f5ffd89ea38..adb3f6aaf64 100644 --- a/src/main/python/systemds/scuro/dataloader/json_loader.py +++ b/src/main/python/systemds/scuro/dataloader/json_loader.py @@ -69,7 +69,9 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): text = " ".join(text) if isinstance(text, list) else text self.data.append(text) - self.metadata[idx] = self.modality_type.create_metadata(len(text), text) + self.metadata.append( + self.modality_type.create_metadata(len(text), text) | json_file[idx] + ) def get_stats(self, source_path: str): self.file_sanity_check(source_path) diff --git a/src/main/python/systemds/scuro/dataloader/pdf_loader.py b/src/main/python/systemds/scuro/dataloader/pdf_loader.py new file mode 100644 index 00000000000..add02e50457 --- /dev/null +++ b/src/main/python/systemds/scuro/dataloader/pdf_loader.py @@ -0,0 +1,70 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +from typing import List, Optional, Union +import pymupdf + +import numpy as np + +from systemds.scuro.dataloader.base_loader import BaseLoader +import cv2 +from systemds.scuro.modality.type import ModalityType + + +class PdfLoader(BaseLoader): + def __init__( + self, + source_path: str, + indices: List[str], + data_type: Union[np.dtype, str] = np.float16, + chunk_size: Optional[int] = None, + load=True, + ext=".pdf", + ): + super().__init__( + source_path, indices, data_type, chunk_size, ModalityType.IMAGE, ext + ) + self.load_data_from_file = load + + def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): + self.file_sanity_check(file) + + doc = pymupdf.open(file) + + for i, page in enumerate(doc.pages()): + image_bytes = page.get_pixmap().tobytes("jpg") + np_buffer = np.frombuffer(image_bytes, dtype=np.uint8) + + image = cv2.imdecode(np_buffer, cv2.IMREAD_COLOR) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + if image.ndim == 2: + height, width = image.shape + channels = 1 + else: + height, width, channels = image.shape + + image = image.astype(np.uint8, copy=False) + + self.metadata.append( + self.modality_type.create_metadata(width, height, channels) + ) + + self.data.append(image) diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py index 8b987f68453..f734a080b15 100644 --- a/src/main/python/systemds/scuro/dataloader/text_loader.py +++ b/src/main/python/systemds/scuro/dataloader/text_loader.py @@ -56,8 +56,8 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): if self.prefix: line = re.sub(self.prefix, "", line) line = line.replace("\n", "") - self.metadata[file] = self.modality_type.create_metadata( - len(line.split()), line + self.metadata.append( + self.modality_type.create_metadata(len(line.split()), line) ) self.data.append(line) diff --git a/src/main/python/systemds/scuro/dataloader/timeseries_loader.py b/src/main/python/systemds/scuro/dataloader/timeseries_loader.py index 6e40e8eb08e..7131b55db11 100644 --- a/src/main/python/systemds/scuro/dataloader/timeseries_loader.py +++ b/src/main/python/systemds/scuro/dataloader/timeseries_loader.py @@ -81,15 +81,20 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): data = self._normalize_signals(data) if file: - self.metadata[index] = self.modality_type.create_metadata( - self.signal_names, data, self.sampling_rate + self.metadata.append( + self.modality_type.create_metadata( + self.signal_names, data, self.sampling_rate + ) ) + self.data.append(data) else: for i, index in enumerate(self.indices): - self.metadata[str(index)] = self.modality_type.create_metadata( - self.signal_names, data[i], self.sampling_rate + self.metadata.append( + self.modality_type.create_metadata( + self.signal_names, data[i], self.sampling_rate + ) ) - self.data.append(data) + self.data.append(data[i]) def _normalize_signals(self, data: np.ndarray) -> np.ndarray: if data.ndim == 1: diff --git a/src/main/python/systemds/scuro/dataloader/transcript_loader.py b/src/main/python/systemds/scuro/dataloader/transcript_loader.py new file mode 100644 index 00000000000..67166f2c327 --- /dev/null +++ b/src/main/python/systemds/scuro/dataloader/transcript_loader.py @@ -0,0 +1,59 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +from typing import List, Optional, Union +from faster_whisper import WhisperModel +import numpy as np + +from systemds.scuro.dataloader.base_loader import BaseLoader +from systemds.scuro.modality.type import ModalityType + + +class TranscriptLoader(BaseLoader): + def __init__( + self, + source_path: str, + indices: List[str], + data_type: Union[np.dtype, str] = np.float32, + chunk_size: Optional[int] = None, + normalize: bool = True, + transcribe_model_size: str = "medium", + load=True, + ): + super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT) + self.model = WhisperModel( + transcribe_model_size, device="cpu", compute_type="int8" + ) + self.normalize = normalize + self.load_data_from_file = load + + def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): + self.file_sanity_check(file) + segments, _ = self.model.transcribe(file, vad_filter=True) + + for i, seg in enumerate(segments): + md = self.modality_type.create_metadata(len(seg.text.split()), seg.text) + md["timestamp_start"] = seg.start + md["timestamp_end"] = seg.end + md["text"] = seg.text + + self.metadata.append(md) + + self.data.append(seg.text) diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py index e57f685e031..a60b7acc60b 100644 --- a/src/main/python/systemds/scuro/dataloader/video_loader.py +++ b/src/main/python/systemds/scuro/dataloader/video_loader.py @@ -87,8 +87,10 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) num_channels = 3 - self.metadata[file] = self.modality_type.create_metadata( - self.fps, length, width, height, num_channels + self.metadata.append( + self.modality_type.create_metadata( + self.fps, length, width, height, num_channels + ) ) frames = [] diff --git a/src/main/python/systemds/scuro/modality/joined.py b/src/main/python/systemds/scuro/modality/joined.py index 335d1959fdc..544285bb52f 100644 --- a/src/main/python/systemds/scuro/modality/joined.py +++ b/src/main/python/systemds/scuro/modality/joined.py @@ -77,7 +77,8 @@ def execute(self, starting_idx=0): ) for i in range(start, end): - idx_1 = list(self.left_modality.metadata.values())[i + starting_idx][ + left_meta_idx = i if self.chunk_left else i + starting_idx + idx_1 = self.left_modality.metadata[left_meta_idx][ self.condition.leftField ] if ( @@ -90,9 +91,7 @@ def execute(self, starting_idx=0): if self.chunk_left: i = i + starting_idx - idx_2 = list(self.right_modality.metadata.values())[i][ - self.condition.rightField - ] + idx_2 = self.right_modality.metadata[i][self.condition.rightField] self.joined_right.data.append([]) c = 0 @@ -228,8 +227,8 @@ def _handle_chunked_execution(self, representation): def _apply_representation_chunked( self, left_modality, right_modality, chunk_right, representation ): - new_left = Modality(left_modality.modality_type, {}) - new_right = Modality(right_modality.modality_type, {}) + new_left = Modality(left_modality.modality_type) + new_right = Modality(right_modality.modality_type) for _ in left_modality.iter_raw_data_chunks(reset=True): if chunk_right: @@ -246,11 +245,11 @@ def _apply_representation_chunked( self.joined_right, representation ) new_right.data.extend(right_transformed.data) - new_right.metadata.update(right_transformed.metadata) + new_right.metadata.extend(right_transformed.metadata) left_transformed = self._apply_representation(left_modality, representation) new_left.data.extend(left_transformed.data) - new_left.metadata.update(left_transformed.metadata) + new_left.metadata.extend(left_transformed.metadata) new_left.update_metadata() new_right.update_metadata() diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py index 1bc8180e199..477e4e45f35 100644 --- a/src/main/python/systemds/scuro/modality/modality.py +++ b/src/main/python/systemds/scuro/modality/modality.py @@ -33,7 +33,7 @@ def __init__( self, modalityType: ModalityType, modality_id=-1, - metadata={}, + metadata=[], data_type=None, transform_time=0, ): @@ -91,10 +91,10 @@ def update_metadata(self): ): return - for i, (md_k, md_v) in enumerate(self.metadata.items()): + for i, md_v in enumerate(self.metadata): md_v = selective_copy_metadata(md_v) updated_md = self.modality_type.update_metadata(md_v, self.data[i]) - self.metadata[md_k] = updated_md + self.metadata[i] = updated_md if i == 0: self.data_type = updated_md["data_layout"]["type"] @@ -160,13 +160,10 @@ def pad(self, value=0, max_len=None): if self.has_metadata(): attention_mask = np.zeros(maxlen, dtype=np.int8) attention_mask[: len(data)] = 1 - md_key = list(self.metadata.keys())[i] - if "attention_mask" in self.metadata[md_key]: - self.metadata[md_key]["attention_mask"] = attention_mask + if "attention_mask" in self.metadata[i]: + self.metadata[i]["attention_mask"] = attention_mask else: - self.metadata[md_key].update( - {"attention_mask": attention_mask} - ) + self.metadata[i].update({"attention_mask": attention_mask}) elif ( isinstance(first, list) and len(first) > 0 @@ -190,13 +187,10 @@ def pad(self, value=0, max_len=None): if self.has_metadata(): attention_mask = np.zeros(maxlen, dtype=np.int8) attention_mask[: len(data)] = 1 - md_key = list(self.metadata.keys())[i] - if "attention_mask" in self.metadata[md_key]: - self.metadata[md_key]["attention_mask"] = attention_mask + if "attention_mask" in self.metadata[i]: + self.metadata[i]["attention_mask"] = attention_mask else: - self.metadata[md_key].update( - {"attention_mask": attention_mask} - ) + self.metadata[i].update({"attention_mask": attention_mask}) else: maxlen = ( max([len(seq) for seq in self.data]) if max_len is None else max_len @@ -214,19 +208,16 @@ def pad(self, value=0, max_len=None): if self.has_metadata(): attention_mask = np.zeros(result.shape[1], dtype=np.int8) attention_mask[: len(data)] = 1 - md_key = list(self.metadata.keys())[i] - if "attention_mask" in self.metadata[md_key]: - self.metadata[md_key]["attention_mask"] = attention_mask + if "attention_mask" in self.metadata[i]: + self.metadata[i]["attention_mask"] = attention_mask else: - self.metadata[md_key].update( - {"attention_mask": attention_mask} - ) + self.metadata[i].update({"attention_mask": attention_mask}) # TODO: this might need to be a new modality (otherwise we loose the original data) self.data = result def get_data_layout(self): if self.has_metadata(): - return list(self.metadata.values())[0]["data_layout"]["representation"] + return self.metadata[0]["data_layout"]["representation"] return None @@ -234,14 +225,14 @@ def has_data(self): return self.data is not None and len(self.data) != 0 def has_metadata(self): - return self.metadata is not None and self.metadata != {} + return self.metadata is not None and len(self.metadata) != 0 def is_aligned(self, other_modality): aligned = True for i in range(len(self.data)): if ( - list(self.metadata.values())[i]["data_layout"]["shape"] - != list(other_modality.metadata.values())[i]["data_layout"]["shape"] + self.metadata[i]["data_layout"]["shape"] + != other_modality.metadata[i]["data_layout"]["shape"] ): aligned = False break diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py index e185deb7c97..eaaa7a20322 100644 --- a/src/main/python/systemds/scuro/modality/transformed.py +++ b/src/main/python/systemds/scuro/modality/transformed.py @@ -66,9 +66,9 @@ def __init__( self.aggregate_dim = aggregate_dim if modality.__class__.__name__ == "UnimodalModality": - for k, v in self.metadata.items(): - if "attention_masks" in v: - del self.metadata[k]["attention_masks"] + for m in self.metadata: + if "attention_masks" in m: + del m["attention_masks"] def copy_from_instance(self): """ @@ -82,8 +82,7 @@ def calculate_memory_usage(self): data_bytes += self._estimate_data_bytes(instance) md_bytes = 0 - for key, value in self.metadata.items(): - md_bytes += self._estimate_data_bytes(key) + for value in self.metadata: md_bytes += self._estimate_data_bytes(value) total_bytes = ( diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py index 9c883efec32..0493edf5bdd 100644 --- a/src/main/python/systemds/scuro/modality/type.py +++ b/src/main/python/systemds/scuro/modality/type.py @@ -212,7 +212,7 @@ def get_schema(self): return ModalitySchemas.get(self.name) def has_field(self, md, field): - for value in md.values(): + for value in md: if field in value: return True else: @@ -221,7 +221,7 @@ def has_field(self, md, field): def get_field_for_instances(self, md, field): data = [] - for items in md.values(): + for items in md: data.append(self.get_field(items, field)) return data @@ -242,8 +242,8 @@ def add_field(self, md, field, data): return md def add_field_for_instances(self, md, field, data): - for key, value in zip(md.keys(), data): - md[key].update({field: value}) + for i, value in enumerate(data): + md[i].update({field: value}) return md diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 319599d6807..84204ac570d 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -41,7 +41,7 @@ def __init__(self, data_loader: BaseLoader): super().__init__( data_loader.modality_type, Identifier().new_id(), - {}, + [], data_loader.data_type, ) self.data_loader = data_loader @@ -56,10 +56,11 @@ def copy_from_instance(self): def get_metadata_at_position(self, position: int): if self.data_loader.chunk_size: return self.metadata[ - self.data_loader.chunk_size * self.data_loader.next_chunk + position + (self.data_loader.next_chunk - 1) * self.data_loader.chunk_size + + position ] - return self.metadata[self.dataIndex][position] + return self.metadata[position] def get_stats(self): return self.stats @@ -165,7 +166,7 @@ def apply_representations(self, representations, aggregation=None): ].data.extend(transformed_chunk.data) transformed_modalities_per_representation[ representation.name - ].metadata.update(transformed_chunk.metadata) + ].metadata.extend(transformed_chunk.metadata) for d in transformed_chunk.data: original_lengths_per_representation[representation.name].append( d.shape[0] diff --git a/src/main/python/systemds/scuro/representations/ orb_alignment.py b/src/main/python/systemds/scuro/representations/ orb_alignment.py new file mode 100644 index 00000000000..5594f3bc54d --- /dev/null +++ b/src/main/python/systemds/scuro/representations/ orb_alignment.py @@ -0,0 +1,51 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +from systemds.scuro.representations.alignment import Alignment +from dataclasses import dataclass +import numpy as np +import cv2 as cv + + +@dataclass +class OrbDescriptor: + kp: object + desc: object + + +class OrbAlignment(Alignment): + def __init__(self): + self.orb = cv.ORB_create() + self.bfm = cv.BFMatcher(cv.NORM_HAMMING, crossCheck=True) + super().__init("OrbAlignment") + + def compute_descriptor(self, segment): + return [OrbDescriptor(self.orb.detectAndCompute(segment, None))] + + def compare(self, a, b): + if a.desc is None or b.desc is None: + return float("inf") + matches = bfm.match(a.desc, b.desc) + good_matches = [m for m in matches if m.distance < 40] + + if len(good_matches) == 0: + return float(inf) + + return np.median([m.distance for m in good_matches]) diff --git a/src/main/python/systemds/scuro/representations/alignment.py b/src/main/python/systemds/scuro/representations/alignment.py new file mode 100644 index 00000000000..42e6df0b205 --- /dev/null +++ b/src/main/python/systemds/scuro/representations/alignment.py @@ -0,0 +1,136 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +from dataclasses import dataclass +from abc import ABC, abstractmethod +from collections import defaultdict +import copy + + +@dataclass +class Match: + primary: int + secondary: int + distance: float + + +class Alignment(ABC): + def __init__(self, name): + self.name = name + + def execute(self, primary_modality, secondary_modality): + primary_descriptor_collections = self._batch_compute_descriptors( + primary_modality + ) + secondary_descriptor_collections = self._batch_compute_descriptors( + secondary_modality + ) + + matches = [] + + for p, p_collection in enumerate(primary_descriptor_collections): + stats = defaultdict( + lambda: { + "count": 0, + "total_distance": 0.0, + "best_distance": float("inf"), + } + ) + + for p_desc in p_collection: + best_secondary = None + best_dist = float("inf") + + for s, s_collection in enumerate(secondary_descriptor_collections): + for s_desc in s_collection: + dist = self.compare(p_desc, s_desc) + + if dist < best_dist: + best_dist = dist + best_secondary = s + + if best_secondary is not None: + stats[best_secondary]["count"] += 1 + stats[best_secondary]["total_distance"] += best_dist + stats[best_secondary]["best_distance"] = min( + stats[best_secondary]["best_distance"], best_dist + ) + + if not stats: + matches.append(Match(p, None, float("inf"))) + continue + + best_match = min( + stats.items(), + key=lambda item: ( + -item[1]["count"], # mehr Votes ist besser + item[1]["total_distance"], # kleinere Gesamtdistanz ist besser + item[1]["best_distance"], # optional weiterer Tie-Breaker + ), + )[0] + + result_distance = ( + stats[best_match]["total_distance"] / stats[best_match]["count"] + ) + + matches.append(Match(p, best_match, result_distance)) + + return matches + + @staticmethod + def apply_matching(alignment, secondary_modality): + aligned_modality = copy.deepcopy(secondary_modality) + aligned_modality.data = [None] * len(alignment) + aligned_modality.metadata = [None] * len(alignment) + + for match in alignment: + aligned_modality.data[match.primary] = secondary_modality.data[ + match.secondary + ] + aligned_modality.metadata[match.primary] = secondary_modality.metadata[ + match.secondary + ] + + return aligned_modality + + def _batch_compute_descriptors(self, modality): + descriptors = [] + + if modality.data_loader.chunk_size: + modality.data_loader.reset() + while modality.data_loader.next_chunk < modality.data_loader.num_chunks: + modality.extract_raw_data() + for d in modality.data: + descriptors.append(self.compute_descriptor(d)) + else: + if not modality.has_data(): + modality.extract_raw_data() + for d in modality.data: + descriptors.append(self.compute_descriptor(d)) + + return descriptors + + @abstractmethod + def compute_descriptor(self, segment): + pass + + @abstractmethod + def compare(self, a, b): + pass diff --git a/src/main/python/systemds/scuro/representations/clip.py b/src/main/python/systemds/scuro/representations/clip.py index b13d2dfeb85..518cc1eb5dc 100644 --- a/src/main/python/systemds/scuro/representations/clip.py +++ b/src/main/python/systemds/scuro/representations/clip.py @@ -243,6 +243,9 @@ def create_visual_embeddings(self, modality): with torch.no_grad(): output = self.model.get_image_features(**inputs) + if hasattr(output, "pooler_output"): + output = output.pooler_output + if len(output.shape) > 2: output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1)) diff --git a/src/main/python/systemds/scuro/representations/concatenation.py b/src/main/python/systemds/scuro/representations/concatenation.py index a7ca905f477..5d53690317e 100644 --- a/src/main/python/systemds/scuro/representations/concatenation.py +++ b/src/main/python/systemds/scuro/representations/concatenation.py @@ -44,9 +44,7 @@ def execute(self, modalities: List[Modality]): if len(modalities) == 1: return np.asarray( modalities[0].data, - dtype=modalities[0].metadata[list(modalities[0].metadata.keys())[0]][ - "data_layout" - ]["type"], + dtype=modalities[0].metadata[0]["data_layout"]["type"], ) max_emb_size = self.get_max_embedding_size(modalities) @@ -64,9 +62,7 @@ def execute(self, modalities: List[Modality]): data, np.asarray( other_modality, - dtype=modality.metadata[list(modality.metadata.keys())[0]][ - "data_layout" - ]["type"], + dtype=modality.metadata[0]["data_layout"]["type"], ), ], axis=-1, diff --git a/src/main/python/systemds/scuro/representations/contrastive_learning.py b/src/main/python/systemds/scuro/representations/contrastive_learning.py new file mode 100644 index 00000000000..3599079e6f7 --- /dev/null +++ b/src/main/python/systemds/scuro/representations/contrastive_learning.py @@ -0,0 +1,81 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +import copy + + +class ContrasitveLearning: + @staticmethod + def execute( + input_first_modality, + input_second_modality, + input_first_extensions, + input_second_extensions, + metadata_matching_function, + ): + # Add check for same dimensionality of input modlities and extensions + + def empty_modality_copy(input_modality): + modality = copy.deepcopy(input_modality) + if isinstance(modality, list): + for m in modality: + m.data = [] + m.metadata = [] + else: + modality.data = [] + modality.metadata = [] + + return modality + + first_modality = empty_modality_copy(input_first_modality) + second_modality = empty_modality_copy(input_second_modality) + first_extensions = empty_modality_copy(input_first_extensions) + second_extensions = empty_modality_copy(input_second_extensions) + + labels = [] + + for i in range(len(input_first_modality.data)): + for j in range(len(input_second_modality.data)): + first_modality.data.append(input_first_modality.data[i]) + first_modality.metadata.append(input_first_modality.metadata[i]) + + for m, input_m in zip(first_extensions, input_first_extensions): + m.data.append(input_m.data[i]) + m.metadata.append(input_m.metadata[i]) + + second_modality.data.append(input_second_modality.data[j]) + second_modality.metadata.append(input_second_modality.metadata[j]) + + for m, input_m in zip(second_extensions, input_second_extensions): + m.data.append(input_m.data[j]) + m.metadata.append(input_m.metadata[j]) + + if metadata_matching_function( + input_first_modality.metadata[i], input_second_modality.metadata[j] + ): + labels.append(True) + else: + labels.append(False) + + return ( + [first_modality] + first_extensions, + [second_modality] + second_extensions, + labels, + ) diff --git a/src/main/python/systemds/scuro/representations/covarep_audio_features.py b/src/main/python/systemds/scuro/representations/covarep_audio_features.py index 28dd6fc2978..01098ef4ee1 100644 --- a/src/main/python/systemds/scuro/representations/covarep_audio_features.py +++ b/src/main/python/systemds/scuro/representations/covarep_audio_features.py @@ -47,7 +47,7 @@ def transform(self, modality, aggregation=None): ) result = [] for i, y in enumerate(modality.data): - sr = list(modality.metadata.values())[i]["frequency"] + sr = modality.metadata[i]["frequency"] spectral_centroid = librosa.feature.spectral_centroid( y=y, sr=sr, hop_length=self.hop_length @@ -222,7 +222,7 @@ def transform(self, modality, aggregation=None): ) result = [] for i, y in enumerate(modality.data): - sr = list(modality.metadata.values())[i]["frequency"] + sr = modality.metadata[i]["frequency"] pitches, magnitudes = librosa.piptrack( y=y, sr=sr, hop_length=self.hop_length diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py index e067e2ccf95..104f2727e69 100644 --- a/src/main/python/systemds/scuro/representations/lstm.py +++ b/src/main/python/systemds/scuro/representations/lstm.py @@ -92,7 +92,7 @@ def _prepare_data(self, modalities: List[Modality]) -> np.ndarray: data = np.array(modality.data) except: max_len = -1 - for md in modality.metadata.values(): + for md in modality.metadata: if max_len < md["data_layout"]["shape"][0]: max_len = md["data_layout"]["shape"][0] data = np.zeros((len(modality.data), max_len)) diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py index 67ff955a98f..46e5045b2eb 100644 --- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py +++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py @@ -57,10 +57,9 @@ def transform(self, modality, aggregation=None): modality, self, self.output_modality_type ) result = [] - metadata_values = list(modality.metadata.values()) for i, sample in enumerate(modality.data): - sr = metadata_values[i]["frequency"] + sr = modality.metadata[i]["frequency"] computed_feature = self.compute_feature(sample, sr) result.append(computed_feature) diff --git a/src/main/python/systemds/scuro/representations/mfcc.py b/src/main/python/systemds/scuro/representations/mfcc.py index 5e5b54b0795..737a3dffe95 100644 --- a/src/main/python/systemds/scuro/representations/mfcc.py +++ b/src/main/python/systemds/scuro/representations/mfcc.py @@ -63,7 +63,7 @@ def transform(self, modality, aggregation=None): result = [] for i, sample in enumerate(modality.data): - sr = list(modality.metadata.values())[i]["frequency"] + sr = modality.metadata[i]["frequency"] computed_feature = self.compute_feature(sample, sr) result.append(computed_feature) diff --git a/src/main/python/systemds/scuro/representations/pHash_alignment.py b/src/main/python/systemds/scuro/representations/pHash_alignment.py new file mode 100644 index 00000000000..f70a12a770f --- /dev/null +++ b/src/main/python/systemds/scuro/representations/pHash_alignment.py @@ -0,0 +1,43 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +from systemds.scuro.representations.alignment import Alignment +import cv2 as cv +import numpy as np + + +class PHashAlignment(Alignment): + def __init__(self): + super().__init__("pHashAlignment") + self.hasher = cv.img_hash.PHash_create() + + def compute_descriptor(self, segment): + if segment.ndim == 3: + return [self.hasher.compute(segment)] + if segment.ndim == 4: # For videos + descriptors = [] + for s in segment: + frame = (s * 255).astype(np.uint8, copy=True) + descriptors.append(self.hasher.compute(frame)) + return descriptors + raise ("PHashAlignment is only implemented for ndim=3 or ndim=4") + + def compare(self, a, b): + return self.hasher.compare(a, b) diff --git a/src/main/python/systemds/scuro/representations/sum.py b/src/main/python/systemds/scuro/representations/sum.py index e6187b4050d..d6c4fe659b2 100644 --- a/src/main/python/systemds/scuro/representations/sum.py +++ b/src/main/python/systemds/scuro/representations/sum.py @@ -43,17 +43,13 @@ def __init__(self, params=None): def execute(self, modalities: List[Modality]): data = np.asarray( modalities[0].data, - dtype=modalities[0].metadata[list(modalities[0].metadata.keys())[0]][ - "data_layout" - ]["type"], + dtype=modalities[0].metadata[0]["data_layout"]["type"], ) for m in range(1, len(modalities)): data += np.asarray( modalities[m].data, - dtype=modalities[m].metadata[list(modalities[m].metadata.keys())[0]][ - "data_layout" - ]["type"], + dtype=modalities[m].metadata[0]["data_layout"]["type"], ) return data diff --git a/src/main/python/systemds/scuro/representations/text_context_with_indices.py b/src/main/python/systemds/scuro/representations/text_context_with_indices.py index 095e9f81063..1a341af1e3a 100644 --- a/src/main/python/systemds/scuro/representations/text_context_with_indices.py +++ b/src/main/python/systemds/scuro/representations/text_context_with_indices.py @@ -208,7 +208,7 @@ def execute(self, modality): List of lists, where each inner list contains text chunks (strings) """ - for instance, metadata in zip(modality.data, modality.metadata.values()): + for instance, metadata in zip(modality.data, modality.metadata): text = _extract_text(instance) if not text: ModalityType.TEXT.add_field(metadata, "text_spans", [(0, 0)]) @@ -344,7 +344,7 @@ def execute(self, modality): List of tuples, where each tuple contains start and end index to the text chunks """ - for instance, metadata in zip(modality.data, modality.metadata.values()): + for instance, metadata in zip(modality.data, modality.metadata): text = _extract_text(instance) if not text: ModalityType.TEXT.add_field(metadata, "text_spans", [(0, 0)]) diff --git a/src/main/python/systemds/scuro/representations/timeseries_representations.py b/src/main/python/systemds/scuro/representations/timeseries_representations.py index bb4ea4f49cb..80f6880a0b8 100644 --- a/src/main/python/systemds/scuro/representations/timeseries_representations.py +++ b/src/main/python/systemds/scuro/representations/timeseries_representations.py @@ -52,7 +52,7 @@ def transform(self, modality, aggregation=None): result.append(feature) transformed_modality.data = np.vstack(np.array(result)).astype( - modality.metadata[list(modality.metadata.keys())[0]]["data_layout"]["type"] + modality.metadata[0]["data_layout"]["type"] ) return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/wav2vec.py b/src/main/python/systemds/scuro/representations/wav2vec.py index c599550be19..38dcb848436 100644 --- a/src/main/python/systemds/scuro/representations/wav2vec.py +++ b/src/main/python/systemds/scuro/representations/wav2vec.py @@ -52,7 +52,7 @@ def transform(self, modality, aggregation=None): result = [] for i, sample in enumerate(modality.data): - sr = list(modality.metadata.values())[i]["frequency"] + sr = modality.metadata[i]["frequency"] audio_resampled = librosa.resample( np.array(sample), orig_sr=sr, target_sr=16000 ) diff --git a/src/main/python/systemds/scuro/representations/window_aggregation.py b/src/main/python/systemds/scuro/representations/window_aggregation.py index c9622f1a2ad..a34b6ebe4c8 100644 --- a/src/main/python/systemds/scuro/representations/window_aggregation.py +++ b/src/main/python/systemds/scuro/representations/window_aggregation.py @@ -224,7 +224,7 @@ def execute(self, modality): ) windowed_data = np.array(padded_features) - data_type = list(modality.metadata.values())[0]["data_layout"]["type"] + data_type = modality.metadata[0]["data_layout"]["type"] if data_type != "str": windowed_data = windowed_data.astype(data_type) diff --git a/src/main/python/systemds/scuro/utils/schema_helpers.py b/src/main/python/systemds/scuro/utils/schema_helpers.py index 3d1fbf4d71a..929e9c7f4f3 100644 --- a/src/main/python/systemds/scuro/utils/schema_helpers.py +++ b/src/main/python/systemds/scuro/utils/schema_helpers.py @@ -43,4 +43,4 @@ def calculate_new_frequency(new_length, old_length, old_frequency): def get_shape(metadata): - return len(list(metadata.values())[0]["data_layout"]["shape"]) + return len(metadata[0]["data_layout"]["shape"]) diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index b78ea314833..a0b43fc8593 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -51,6 +51,7 @@ class TestDataLoader(BaseLoader): def __init__(self, indices, chunk_size, modality_type, data, data_type, metadata): super().__init__("", indices, data_type, chunk_size, modality_type) + self._full_metadata = metadata self.metadata = metadata self.test_data = data if modality_type == ModalityType.TEXT: @@ -110,8 +111,10 @@ def reset(self): def extract(self, file, indices): if isinstance(self.test_data, list): self.data = [self.test_data[i] for i in indices] + self.metadata = [self._full_metadata[i] for i in indices] else: self.data = self.test_data[indices] + self.metadata = [self._full_metadata[i] for i in indices] class ModalityRandomDataGenerator: @@ -120,7 +123,7 @@ def __init__(self): np.random.seed(4) self.modality_id = 0 self.modality_type = None - self.metadata = {} + self.metadata = [] self.data_type = np.float32 self.transform_time = 0 self.stats = None @@ -186,21 +189,22 @@ def create1DModality( # TODO: write a dummy method to create the same metadata for all instances to avoid the for loop self.modality_type = modality_type + self.metadata = [] for i in range(num_instances): if modality_type == ModalityType.AUDIO: - self.metadata[i] = modality_type.create_metadata( + self.metadata.append(modality_type.create_metadata( num_features / 10, data[i] - ) + )) elif modality_type == ModalityType.TEXT: - self.metadata[i] = modality_type.create_metadata( + self.metadata.append(modality_type.create_metadata( num_features / 10, data[i] - ) + )) elif modality_type == ModalityType.VIDEO: - self.metadata[i] = modality_type.create_metadata( + self.metadata.append(modality_type.create_metadata( num_features / 30, 10, 0, 0, 1 - ) + )) elif modality_type == ModalityType.TIMESERIES: - self.metadata[i] = modality_type.create_metadata(["test"], data[i]) + self.metadata.append(modality_type.create_metadata(["test"], data[i])) else: raise NotImplementedError @@ -222,10 +226,10 @@ def create_audio_data(self, num_instances, max_audio_length): for i in range(num_instances): data[i] = np.array(data[i]).astype(self.data_type) - self.metadata = { - i: self.modality_type.create_metadata(16000, np.array(data[i])) + self.metadata = [ + self.modality_type.create_metadata(16000, np.array(data[i])) for i in range(num_instances) - } + ] return data, self.metadata @@ -237,12 +241,12 @@ def create_timeseries_data(self, num_instances, sequence_length, num_features=1) ] if num_features == 1: data = [d.squeeze(-1) for d in data] - self.metadata = { - i: self.modality_type.create_metadata( + self.metadata = [ + self.modality_type.create_metadata( [f"feature_{j}" for j in range(num_features)], data[i] ) for i in range(num_instances) - } + ] return data, self.metadata def create_text_data(self, num_instances, num_sentences_per_instance=1): @@ -308,10 +312,10 @@ def create_text_data(self, num_instances, num_sentences_per_instance=1): sentence += f" {verb} {obj}{punct}" sentences.append(sentence) - self.metadata = { - i: self.modality_type.create_metadata(len(sentences[i]), sentences[i]) + self.metadata = [ + self.modality_type.create_metadata(len(sentences[i]), sentences[i]) for i in range(num_instances) - } + ] return sentences, self.metadata @@ -321,9 +325,9 @@ def create_3d_modality(self, num_instances, dims=(100, 28, 28)): np.random.rand(dims[0], dims[1], dims[2]).astype(self.data_type) for _ in range(num_instances) ] - self.metadata = { - i: self.modality_type.create_metadata(data[i]) for i in range(num_instances) - } + self.metadata = [ + self.modality_type.create_metadata(data[i]) for i in range(num_instances) + ] return data, self.metadata def create_2d_modality(self, num_instances, dims=(100, 28)): @@ -332,9 +336,9 @@ def create_2d_modality(self, num_instances, dims=(100, 28)): np.random.rand(dims[0], dims[1]).astype(self.data_type) for _ in range(num_instances) ] - self.metadata = { - i: self.modality_type.create_metadata(data[i]) for i in range(num_instances) - } + self.metadata = [ + self.modality_type.create_metadata(data[i]) for i in range(num_instances) + ] return data, self.metadata def create_visual_modality( @@ -356,12 +360,12 @@ def create_visual_modality( for _ in range(num_instances) ] - self.metadata = { - i: self.modality_type.create_metadata( + self.metadata = [ + self.modality_type.create_metadata( 30, data[i].shape[0], width, height, color_channels ) for i in range(num_instances) - } + ] else: self.modality_type = ModalityType.IMAGE data = [ @@ -373,10 +377,10 @@ def create_visual_modality( ) for _ in range(num_instances) ] - self.metadata = { - i: self.modality_type.create_metadata(width, height, color_channels) + self.metadata = [ + self.modality_type.create_metadata(width, height, color_channels) for i in range(num_instances) - } + ] return data, self.metadata diff --git a/src/main/python/tests/scuro/test_text_context_operators.py b/src/main/python/tests/scuro/test_text_context_operators.py index ffa702b7c82..0b8ad36c160 100644 --- a/src/main/python/tests/scuro/test_text_context_operators.py +++ b/src/main/python/tests/scuro/test_text_context_operators.py @@ -83,7 +83,7 @@ def test_sentence_boundary_split_indices(self): sentence_boundary_split = SentenceBoundarySplitIndices(10, min_words=4) sentence_boundary_split.execute(self.text_modality) for instance, md in zip( - self.text_modality.data, self.text_modality.metadata.values() + self.text_modality.data, self.text_modality.metadata ): for chunk in md["text_spans"]: text = instance[chunk[0] : chunk[1]].split(" ") @@ -95,7 +95,7 @@ def test_overlapping_split_indices(self): overlapping_split = OverlappingSplitIndices(40, 0.1) overlapping_split.execute(self.text_modality) for instance, md in zip( - self.text_modality.data, self.text_modality.metadata.values() + self.text_modality.data, self.text_modality.metadata ): prev_chunk = (0, 0) for j, chunk in enumerate(md["text_spans"]):