From d4d14d1a07b01d0e5ff644892b0d75197d455dc2 Mon Sep 17 00:00:00 2001 From: dbrkn Date: Mon, 8 Jun 2026 19:34:15 +0300 Subject: [PATCH 1/2] Support character-level WER for CJK languages --- .../word_error_metrics/word_error_metrics.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/openbench/metric/word_error_metrics/word_error_metrics.py b/src/openbench/metric/word_error_metrics/word_error_metrics.py index 4cef2a1..3d5d648 100644 --- a/src/openbench/metric/word_error_metrics/word_error_metrics.py +++ b/src/openbench/metric/word_error_metrics/word_error_metrics.py @@ -61,6 +61,19 @@ def __init__( def _supports_paired_evaluation(self) -> bool: return True + @staticmethod + def _is_character_level(words: list[str]) -> bool: + """Detect character-level tokenization (CJK) by checking if most tokens are single chars.""" + if not words: + return False + single_char_ratio = sum(1 for w in words if len(w) == 1) / len(words) + return single_char_ratio > 0.5 + + @staticmethod + def _split_to_chars(words: list[str]) -> list[str]: + """Split word tokens into individual characters, stripping whitespace.""" + return [ch for w in words for ch in w.strip() if ch.strip()] + def _get_word_error_metrics( self, reference: Transcript, hypothesis: Transcript ) -> tuple[ @@ -81,6 +94,10 @@ def _get_word_error_metrics( speakers=hyp_speakers, ) + if self._is_character_level(ref_words): + hyp_words = self._split_to_chars(hyp_words) + hyp_speakers = None + result = jiwer.compute_measures( truth=" ".join(ref_words), hypothesis=" ".join(hyp_words), From 2bdefcea0af81e1fc70332fcb3b4fe959e080d32 Mon Sep 17 00:00:00 2001 From: dbrkn Date: Mon, 8 Jun 2026 20:10:47 +0300 Subject: [PATCH 2/2] Add a dedicated CharacterErrorRate metric that splits both reference and hypothesis tokens into individual characters before alignment. This is suitable for CJK languages (Chinese, Japanese, Korean) where word boundaries are not marked by spaces. --- src/openbench/metric/metric.py | 5 + .../word_error_metrics/word_error_metrics.py | 105 +++++++++++++++--- 2 files changed, 93 insertions(+), 17 deletions(-) diff --git a/src/openbench/metric/metric.py b/src/openbench/metric/metric.py index 559712f..ccf1478 100644 --- a/src/openbench/metric/metric.py +++ b/src/openbench/metric/metric.py @@ -63,6 +63,11 @@ class MetricOptions(Enum): # Ref: https://en.wikipedia.org/wiki/Word_error_rate WER = "wer" + # Character Error Rate + # Evaluates transcription accuracy at character level, suitable for CJK languages + # Ref: https://en.wikipedia.org/wiki/Word_error_rate (applied at character granularity) + CER = "cer" + # Concatenated minimum-Permutation Word Error Rate # Evaluates multi-speaker transcription by finding the optimal speaker permutation # Ref: https://arxiv.org/abs/2004.09249 diff --git a/src/openbench/metric/word_error_metrics/word_error_metrics.py b/src/openbench/metric/word_error_metrics/word_error_metrics.py index 3d5d648..ee6651f 100644 --- a/src/openbench/metric/word_error_metrics/word_error_metrics.py +++ b/src/openbench/metric/word_error_metrics/word_error_metrics.py @@ -61,19 +61,6 @@ def __init__( def _supports_paired_evaluation(self) -> bool: return True - @staticmethod - def _is_character_level(words: list[str]) -> bool: - """Detect character-level tokenization (CJK) by checking if most tokens are single chars.""" - if not words: - return False - single_char_ratio = sum(1 for w in words if len(w) == 1) / len(words) - return single_char_ratio > 0.5 - - @staticmethod - def _split_to_chars(words: list[str]) -> list[str]: - """Split word tokens into individual characters, stripping whitespace.""" - return [ch for w in words for ch in w.strip() if ch.strip()] - def _get_word_error_metrics( self, reference: Transcript, hypothesis: Transcript ) -> tuple[ @@ -94,10 +81,6 @@ def _get_word_error_metrics( speakers=hyp_speakers, ) - if self._is_character_level(ref_words): - hyp_words = self._split_to_chars(hyp_words) - hyp_speakers = None - result = jiwer.compute_measures( truth=" ".join(ref_words), hypothesis=" ".join(hyp_words), @@ -314,6 +297,94 @@ def compute_metric(self, detail: Details) -> float: return (S + D + I) / N if N > 0 else 0.0 +def _split_to_chars(words: list[str]) -> list[str]: + """Split word-level tokens into individual characters, stripping whitespace.""" + return [ch for w in words for ch in w.strip() if ch.strip()] + + +@MetricRegistry.register_metric( + ( + PipelineType.TRANSCRIPTION, + PipelineType.ORCHESTRATION, + PipelineType.STREAMING_TRANSCRIPTION, + ), + MetricOptions.CER, +) +class CharacterErrorRate(BaseWordErrorMetric): + """Character Error Rate (CER) implementation. + + This metric evaluates transcription accuracy at the character level. + Both reference and hypothesis tokens are split into individual characters + before alignment, making it suitable for CJK languages (Chinese, Japanese, + Korean) where word boundaries are not marked by spaces. + + CER = (S + D + I) / N (same formula as WER, applied to characters) + """ + + @classmethod + def metric_name(cls) -> str: + return "cer" + + @classmethod + def metric_components(cls) -> MetricComponents: + return [ + "num_substitutions", + "num_deletions", + "num_insertions", + "num_characters", + ] + + def compute_components( + self, + reference: Transcript, + hypothesis: Transcript, + **kwargs, + ) -> dict[str, int]: + ref_words, _ = parse_diarzed_words(reference) + hyp_words, _ = parse_diarzed_words(hypothesis) + + if self.use_text_normalizer: + ref_words, _ = self.text_normalizer(words=ref_words, speakers=None) + hyp_words, _ = self.text_normalizer(words=hyp_words, speakers=None) + + ref_chars = _split_to_chars(ref_words) + hyp_chars = _split_to_chars(hyp_words) + + result = jiwer.compute_measures( + truth=" ".join(ref_chars), + hypothesis=" ".join(hyp_chars), + ) + result = AlignmentMetrics(**result) + alignments = result.ops[0] + + num_substitutions = 0 + num_deletions = 0 + num_insertions = 0 + + for alignment in alignments: + if alignment.type == "substitute": + num_substitutions += alignment.ref_end_idx - alignment.ref_start_idx + elif alignment.type == "delete": + num_deletions += alignment.ref_end_idx - alignment.ref_start_idx + elif alignment.type == "insert": + num_insertions += alignment.hyp_end_idx - alignment.hyp_start_idx + + return { + "num_substitutions": num_substitutions, + "num_deletions": num_deletions, + "num_insertions": num_insertions, + "num_characters": len(ref_chars), + } + + def compute_metric(self, detail: Details) -> float: + S = detail["num_substitutions"] + D = detail["num_deletions"] + I = detail["num_insertions"] # noqa: E741 + N = detail["num_characters"] + + return (S + D + I) / N if N > 0 else 0.0 + + @MetricRegistry.register_metric(PipelineType.ORCHESTRATION, MetricOptions.CPWER) class ConcatenatedMinimumPermutationWER(BaseWordErrorMetric): """Concatenated minimum-Permutation Word Error Rate (cpWER) implementation.