From 203a883552e58a7e1a055e6a92653ae3def22ff7 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang <hongtaozhang@microsoft.com>
Date: Wed, 1 Apr 2026 06:06:01 +0000
Subject: [PATCH 1/8] Fix the num_workers usage.

---
 superbench/benchmarks/model_benchmarks/megatron_gpt3.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 37d27bf1a..9d34ab9eb 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -651,13 +651,16 @@ def _generate_dataset(self):
                 if self._args.dataset_url:
                     self._raw_data_path = str(Path(self._args.data_home) / 'data.json')
                     download_file(self._args.dataset_url, self._raw_data_path)
+
                     command = (
                         'python3 '
                         f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} '
                         f'--input {self._raw_data_path} '
                         f'--tokenizer-type {self._args.tokenizer_type} '
                         f'--output-prefix {os.path.join(self._args.data_home, "dataset")} '
-                        f'--workers {str(self._args.num_workers)} '
+                        # num_workers=0 is valid for DataLoader (main process loads data),
+                        # but preprocess_data.py requires workers>=1 for multiprocessing.Pool.
+                        f'--workers {max(1, self._args.num_workers)} '
                         f'--vocab-file {self._vocab_path} '
                         f'--merge-file {self._merges_path}'
                     )

From c6d205e4d1e9903470d66fb299547c04b950e253 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang <hongtaozhang@microsoft.com>
Date: Mon, 27 Apr 2026 21:59:47 +0000
Subject: [PATCH 2/8] Address PR review: derive --output-prefix from
 data_prefix, add tests

- Megatron's preprocess_data.py appends '_text_document' to the
  --output-prefix when producing the .bin/.idx files. Derive the
  output-prefix from --data_prefix (stripping a trailing
  '_text_document' suffix when present) so that the generated files
  match the existence checks for any custom data_prefix value, instead
  of being hardcoded to '<data_home>/dataset'.
- Add unit test test_megatron_gpt_dataset_generate_command covering:
  num_workers=0 clamps to '--workers 1' with default data_prefix
  ('dataset_text_document' -> '<data_home>/dataset'), num_workers=4
  with custom 'custom_text_document' -> '<data_home>/custom', and a
  data_prefix without the suffix used as-is.
---
 .../model_benchmarks/megatron_gpt3.py         | 12 +++-
 .../model_benchmarks/test_megatron_gpt.py     | 66 +++++++++++++++++++
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 9d34ab9eb..1143fc2ff 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -652,12 +652,22 @@ def _generate_dataset(self):
                     self._raw_data_path = str(Path(self._args.data_home) / 'data.json')
                     download_file(self._args.dataset_url, self._raw_data_path)
 
+                    # Megatron's preprocess_data.py appends '_text_document' to --output-prefix
+                    # when producing the .bin/.idx files. Derive the output-prefix from
+                    # data_prefix (stripping the '_text_document' suffix when present) so that
+                    # the generated files match the existence checks above for any custom
+                    # data_prefix value.
+                    output_prefix_basename = self._args.data_prefix
+                    if output_prefix_basename.endswith('_text_document'):
+                        output_prefix_basename = output_prefix_basename[:-len('_text_document')]
+                    output_prefix = os.path.join(self._args.data_home, output_prefix_basename)
+
                     command = (
                         'python3 '
                         f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} '
                         f'--input {self._raw_data_path} '
                         f'--tokenizer-type {self._args.tokenizer_type} '
-                        f'--output-prefix {os.path.join(self._args.data_home, "dataset")} '
+                        f'--output-prefix {output_prefix} '
                         # num_workers=0 is valid for DataLoader (main process loads data),
                         # but preprocess_data.py requires workers>=1 for multiprocessing.Pool.
                         f'--workers {max(1, self._args.num_workers)} '
diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index b7c588677..b0efd3483 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -174,6 +174,72 @@ def test_megatron_gpt_dataset(self):
         ret = benchmark._generate_dataset()
         assert (ret is True)
 
+    @mock.patch('superbench.benchmarks.model_benchmarks.megatron_gpt3.run_command')
+    @mock.patch('superbench.benchmarks.model_benchmarks.megatron_gpt3.download_file')
+    def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_run_command):
+        """Verify _generate_dataset clamps --workers to >=1 and derives --output-prefix from data_prefix."""
+        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
+        assert (benchmark_cls)
+        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
+        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
+        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
+        os.environ['MASTER_ADDR'] = 'localhost'
+        os.environ['MASTER_PORT'] = '12345'
+
+        # Case 1: num_workers=0 with default data_prefix should produce
+        # '--workers 1' (clamped) and '--output-prefix <data_home>/dataset'
+        # (default data_prefix='dataset_text_document' with the suffix stripped).
+        benchmark = benchmark_cls(
+            self.benchmark_name,
+            parameters=(
+                f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} '
+                f'--batch_size 2048 --num_workers 0 '
+                f'--dataset_url http://example.com/data.json'
+            ),
+        )
+        benchmark._preprocess()
+        ret = benchmark._generate_dataset()
+        # Dataset generation will fail because the mocked run_command does not actually
+        # produce .bin/.idx files; we only care about the constructed command.
+        assert ret is False
+        assert mock_run_command.call_count >= 1
+        cmd = mock_run_command.call_args_list[0].args[0]
+        assert '--workers 1' in cmd, cmd
+        assert f'--output-prefix {os.path.join(self._tmp_dir, "dataset")} ' in cmd, cmd
+
+        # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should
+        # produce '--workers 4' and '--output-prefix <data_home>/custom'.
+        mock_run_command.reset_mock()
+        benchmark = benchmark_cls(
+            self.benchmark_name,
+            parameters=(
+                f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} '
+                f'--batch_size 2048 --num_workers 4 --data_prefix custom_text_document '
+                f'--dataset_url http://example.com/data.json'
+            ),
+        )
+        benchmark._preprocess()
+        benchmark._generate_dataset()
+        cmd = mock_run_command.call_args_list[0].args[0]
+        assert '--workers 4' in cmd, cmd
+        assert f'--output-prefix {os.path.join(self._tmp_dir, "custom")} ' in cmd, cmd
+
+        # Case 3: data_prefix without the '_text_document' suffix is used as-is.
+        mock_run_command.reset_mock()
+        benchmark = benchmark_cls(
+            self.benchmark_name,
+            parameters=(
+                f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} '
+                f'--batch_size 2048 --num_workers 2 --data_prefix mydata '
+                f'--dataset_url http://example.com/data.json'
+            ),
+        )
+        benchmark._preprocess()
+        benchmark._generate_dataset()
+        cmd = mock_run_command.call_args_list[0].args[0]
+        assert '--workers 2' in cmd, cmd
+        assert f'--output-prefix {os.path.join(self._tmp_dir, "mydata")} ' in cmd, cmd
+
     @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
     def test_megatron_gpt_command(self, mock_generate_dataset):
         """Test command generation."""

From ee2839e9d72fec7ee799fd8a1a71deaf90cf9c51 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang <hongtaozhang@microsoft.com>
Date: Mon, 27 Apr 2026 22:03:02 +0000
Subject: [PATCH 3/8] Tighten test substring matches with trailing space

Avoid '--workers 1' matching '--workers 10' etc.
---
 tests/benchmarks/model_benchmarks/test_megatron_gpt.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index b0efd3483..45a6b570e 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -204,7 +204,7 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru
         assert ret is False
         assert mock_run_command.call_count >= 1
         cmd = mock_run_command.call_args_list[0].args[0]
-        assert '--workers 1' in cmd, cmd
+        assert '--workers 1 ' in cmd, cmd
         assert f'--output-prefix {os.path.join(self._tmp_dir, "dataset")} ' in cmd, cmd
 
         # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should
@@ -221,7 +221,7 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru
         benchmark._preprocess()
         benchmark._generate_dataset()
         cmd = mock_run_command.call_args_list[0].args[0]
-        assert '--workers 4' in cmd, cmd
+        assert '--workers 4 ' in cmd, cmd
         assert f'--output-prefix {os.path.join(self._tmp_dir, "custom")} ' in cmd, cmd
 
         # Case 3: data_prefix without the '_text_document' suffix is used as-is.
@@ -237,7 +237,7 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru
         benchmark._preprocess()
         benchmark._generate_dataset()
         cmd = mock_run_command.call_args_list[0].args[0]
-        assert '--workers 2' in cmd, cmd
+        assert '--workers 2 ' in cmd, cmd
         assert f'--output-prefix {os.path.join(self._tmp_dir, "mydata")} ' in cmd, cmd
 
     @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')

From b90dbf964c83bda263c3f7007a0bcc447a59a3b6 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang <hongtaozhang@microsoft.com>
Date: Sat, 2 May 2026 00:19:08 +0000
Subject: [PATCH 4/8] Address PR review: use normalize_command and valid
 code_base in dataset generate test

- Replace brittle whitespace substring assertions ('--workers 1 ', '--output-prefix ... ') with normalize_command()-based parsed CLI unit checks, so the test validates semantics rather than formatting.
- Use --code_base {self._tmp_dir} together with createMockFiles(['pretrain_gpt.py']) to avoid the unrealistic /root/Megatron-DeepSpeed path. The mocked run_command now creates the expected .bin/.idx files via side_effect so _preprocess() succeeds end-to-end and is asserted to be True.
---
 .../model_benchmarks/test_megatron_gpt.py     | 104 ++++++++++--------
 1 file changed, 56 insertions(+), 48 deletions(-)

diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index 45a6b570e..7e0b7a6bb 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -186,59 +186,67 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru
         os.environ['MASTER_ADDR'] = 'localhost'
         os.environ['MASTER_PORT'] = '12345'
 
-        # Case 1: num_workers=0 with default data_prefix should produce
-        # '--workers 1' (clamped) and '--output-prefix <data_home>/dataset'
-        # (default data_prefix='dataset_text_document' with the suffix stripped).
-        benchmark = benchmark_cls(
-            self.benchmark_name,
-            parameters=(
-                f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} '
-                f'--batch_size 2048 --num_workers 0 '
-                f'--dataset_url http://example.com/data.json'
-            ),
+        # Use a real, valid code_base so _preprocess() can validate it (avoid hardcoded /root path).
+        self.createMockFiles(['pretrain_gpt.py'])
+
+        # Helper: make run_command's side_effect create the expected .bin/.idx files
+        # so _generate_dataset() (invoked from within _preprocess()) succeeds.
+        created_files = []
+
+        def _make_dataset_files(prefix):
+            def _side_effect(*_args, **_kwargs):
+                for ext in ('.bin', '.idx'):
+                    p = Path(self._tmp_dir) / f'{prefix}{ext}'
+                    p.touch()
+                    created_files.append(p)
+            return _side_effect
+
+        self.addCleanup(lambda: [p.unlink() for p in created_files if p.is_file()])
+
+        def _run_case(extra_params, expected_workers, expected_prefix_basename, expected_data_prefix):
+            mock_run_command.reset_mock()
+            mock_run_command.side_effect = _make_dataset_files(expected_data_prefix)
+            benchmark = benchmark_cls(
+                self.benchmark_name,
+                parameters=(
+                    f'--code_base {self._tmp_dir} --data_home {self._tmp_dir} '
+                    f'--batch_size 2048 --dataset_url http://example.com/data.json '
+                    f'{extra_params}'
+                ),
+            )
+            assert benchmark._preprocess() is True
+            assert mock_run_command.call_count >= 1
+            cmd = mock_run_command.call_args_list[0].args[0]
+            units = normalize_command(cmd)
+            assert f'--workers {expected_workers}' in units, units
+            expected_output_prefix = os.path.join(self._tmp_dir, expected_prefix_basename)
+            assert f'--output-prefix {expected_output_prefix}' in units, units
+
+        # Case 1: num_workers=0 with default data_prefix should produce '--workers 1' (clamped)
+        # and '--output-prefix <data_home>/dataset' (default 'dataset_text_document' suffix stripped).
+        _run_case(
+            extra_params='--num_workers 0',
+            expected_workers=1,
+            expected_prefix_basename='dataset',
+            expected_data_prefix='dataset_text_document',
         )
-        benchmark._preprocess()
-        ret = benchmark._generate_dataset()
-        # Dataset generation will fail because the mocked run_command does not actually
-        # produce .bin/.idx files; we only care about the constructed command.
-        assert ret is False
-        assert mock_run_command.call_count >= 1
-        cmd = mock_run_command.call_args_list[0].args[0]
-        assert '--workers 1 ' in cmd, cmd
-        assert f'--output-prefix {os.path.join(self._tmp_dir, "dataset")} ' in cmd, cmd
-
-        # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should
-        # produce '--workers 4' and '--output-prefix <data_home>/custom'.
-        mock_run_command.reset_mock()
-        benchmark = benchmark_cls(
-            self.benchmark_name,
-            parameters=(
-                f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} '
-                f'--batch_size 2048 --num_workers 4 --data_prefix custom_text_document '
-                f'--dataset_url http://example.com/data.json'
-            ),
+
+        # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should produce
+        # '--workers 4' and '--output-prefix <data_home>/custom'.
+        _run_case(
+            extra_params='--num_workers 4 --data_prefix custom_text_document',
+            expected_workers=4,
+            expected_prefix_basename='custom',
+            expected_data_prefix='custom_text_document',
         )
-        benchmark._preprocess()
-        benchmark._generate_dataset()
-        cmd = mock_run_command.call_args_list[0].args[0]
-        assert '--workers 4 ' in cmd, cmd
-        assert f'--output-prefix {os.path.join(self._tmp_dir, "custom")} ' in cmd, cmd
 
         # Case 3: data_prefix without the '_text_document' suffix is used as-is.
-        mock_run_command.reset_mock()
-        benchmark = benchmark_cls(
-            self.benchmark_name,
-            parameters=(
-                f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} '
-                f'--batch_size 2048 --num_workers 2 --data_prefix mydata '
-                f'--dataset_url http://example.com/data.json'
-            ),
+        _run_case(
+            extra_params='--num_workers 2 --data_prefix mydata',
+            expected_workers=2,
+            expected_prefix_basename='mydata',
+            expected_data_prefix='mydata',
         )
-        benchmark._preprocess()
-        benchmark._generate_dataset()
-        cmd = mock_run_command.call_args_list[0].args[0]
-        assert '--workers 2 ' in cmd, cmd
-        assert f'--output-prefix {os.path.join(self._tmp_dir, "mydata")} ' in cmd, cmd
 
     @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
     def test_megatron_gpt_command(self, mock_generate_dataset):

From 687ac81f995a3fb2c5d8bfeb8b1afa2415c46931 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang <hongtaozhang@microsoft.com>
Date: Sun, 3 May 2026 05:04:57 +0000
Subject: [PATCH 5/8] Address review feedback for num_workers / data_prefix
 handling

- Warn when num_workers is silently clamped from 0 to 1 for the
  preprocess subprocess so the user sees the override in the log.
  The DataLoader still receives the original num_workers value.
- Guard the '_text_document' suffix-strip against the edge case where
  data_prefix == '_text_document' (which would otherwise produce a
  malformed --output-prefix '<data_home>/' with an empty basename).
  Fall back to the original data_prefix value in that case.
- Extend test_megatron_gpt_dataset_generate_command with a 4th case
  asserting the empty-basename fallback.
---
 .../model_benchmarks/megatron_gpt3.py         | 22 +++++++++++++++----
 .../model_benchmarks/test_megatron_gpt.py     | 10 +++++++++
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 1143fc2ff..7541aee4b 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -659,18 +659,32 @@ def _generate_dataset(self):
                     # data_prefix value.
                     output_prefix_basename = self._args.data_prefix
                     if output_prefix_basename.endswith('_text_document'):
-                        output_prefix_basename = output_prefix_basename[:-len('_text_document')]
+                        stripped = output_prefix_basename[:-len('_text_document')]
+                        # Guard against data_prefix == '_text_document' which would
+                        # leave an empty basename and produce a malformed --output-prefix
+                        # ending in '/'. Fall back to the original value in that case.
+                        output_prefix_basename = stripped or output_prefix_basename
                     output_prefix = os.path.join(self._args.data_home, output_prefix_basename)
 
+                    # num_workers=0 is valid for DataLoader (main process loads data),
+                    # but preprocess_data.py requires workers>=1 for multiprocessing.Pool.
+                    preprocess_workers = max(1, self._args.num_workers)
+                    if preprocess_workers != self._args.num_workers:
+                        logger.warning(
+                            'preprocess_data.py requires --workers >= 1; '
+                            'overriding num_workers={} to {} for dataset preprocessing only '
+                            '(DataLoader still uses num_workers={}).'.format(
+                                self._args.num_workers, preprocess_workers, self._args.num_workers
+                            )
+                        )
+
                     command = (
                         'python3 '
                         f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} '
                         f'--input {self._raw_data_path} '
                         f'--tokenizer-type {self._args.tokenizer_type} '
                         f'--output-prefix {output_prefix} '
-                        # num_workers=0 is valid for DataLoader (main process loads data),
-                        # but preprocess_data.py requires workers>=1 for multiprocessing.Pool.
-                        f'--workers {max(1, self._args.num_workers)} '
+                        f'--workers {preprocess_workers} '
                         f'--vocab-file {self._vocab_path} '
                         f'--merge-file {self._merges_path}'
                     )
diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index 7e0b7a6bb..e856a8010 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -248,6 +248,16 @@ def _run_case(extra_params, expected_workers, expected_prefix_basename, expected
             expected_data_prefix='mydata',
         )
 
+        # Case 4: edge case - data_prefix == '_text_document' should NOT strip down
+        # to an empty basename (which would produce '--output-prefix <data_home>/').
+        # Fall back to using '_text_document' as the basename.
+        _run_case(
+            extra_params='--num_workers 1 --data_prefix _text_document',
+            expected_workers=1,
+            expected_prefix_basename='_text_document',
+            expected_data_prefix='_text_document',
+        )
+
     @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
     def test_megatron_gpt_command(self, mock_generate_dataset):
         """Test command generation."""

From 9018df049b1c14d93e7c0131b0e9c46151386e01 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang <hongtaozhang@microsoft.com>
Date: Fri, 15 May 2026 20:50:16 +0000
Subject: [PATCH 6/8] fix(megatron-gpt): address CI failures and Copilot review

- Fix test pollution: clean up pretrain_gpt.py created by
  test_megatron_gpt_dataset_generate_command so it does not break the
  alphabetically-later test_megatron_gpt_preprocess (root cause of CI
  cpu-unit-test failures on python-3.7/3.10/3.12).
- Fix Python 3.7 incompatibility: use call_args_list[0][0][0] instead
  of .args[0] (mock.call.args is Python 3.8+).
- Enforce data_prefix contract in _generate_dataset(): validate that
  data_prefix ends with '_text_document' and has a non-empty stem
  before invoking preprocess_data.py. Fail fast with a clear error
  otherwise (preprocess_data.py always appends '_text_document' to
  --output-prefix, so other prefixes would silently produce files
  whose names mismatch the existence check).
- Replace test Cases 3 and 4 (which masked the contract bug via a
  side_effect that lied about preprocess_data.py output) with
  negative cases that assert _preprocess() fails fast and never
  invokes run_command for invalid data_prefix values.
---
 .../model_benchmarks/megatron_gpt3.py         | 28 +++++-----
 .../model_benchmarks/test_megatron_gpt.py     | 51 +++++++++++--------
 2 files changed, 46 insertions(+), 33 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 7541aee4b..fec372211 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -630,7 +630,7 @@ def _init_distributed_setting(self):
             f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
         return True
 
-    def _generate_dataset(self):
+    def _generate_dataset(self):    # noqa: C901
         """Generate dataset for benchmarking.
 
         Return:
@@ -653,17 +653,21 @@ def _generate_dataset(self):
                     download_file(self._args.dataset_url, self._raw_data_path)
 
                     # Megatron's preprocess_data.py appends '_text_document' to --output-prefix
-                    # when producing the .bin/.idx files. Derive the output-prefix from
-                    # data_prefix (stripping the '_text_document' suffix when present) so that
-                    # the generated files match the existence checks above for any custom
-                    # data_prefix value.
-                    output_prefix_basename = self._args.data_prefix
-                    if output_prefix_basename.endswith('_text_document'):
-                        stripped = output_prefix_basename[:-len('_text_document')]
-                        # Guard against data_prefix == '_text_document' which would
-                        # leave an empty basename and produce a malformed --output-prefix
-                        # ending in '/'. Fall back to the original value in that case.
-                        output_prefix_basename = stripped or output_prefix_basename
+                    # when producing the .bin/.idx files. For the existence check below
+                    # (which looks for {data_prefix}.bin/.idx) to pass, data_prefix must end
+                    # with '_text_document' and have a non-empty stem when generation is needed.
+                    suffix = '_text_document'
+                    if not self._args.data_prefix.endswith(suffix) or self._args.data_prefix == suffix:
+                        logger.error(
+                            'data_prefix must end with "{}" and have a non-empty stem when '
+                            'dataset generation is required (got "{}"). preprocess_data.py '
+                            'always appends "{}" to --output-prefix.'.format(
+                                suffix, self._args.data_prefix, suffix
+                            )
+                        )
+                        self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE)
+                        return False
+                    output_prefix_basename = self._args.data_prefix[:-len(suffix)]
                     output_prefix = os.path.join(self._args.data_home, output_prefix_basename)
 
                     # num_workers=0 is valid for DataLoader (main process loads data),
diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index e856a8010..034fbf28f 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -187,7 +187,11 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru
         os.environ['MASTER_PORT'] = '12345'
 
         # Use a real, valid code_base so _preprocess() can validate it (avoid hardcoded /root path).
+        # Clean up after this test so the alphabetically-later test_megatron_gpt_preprocess
+        # (which expects pretrain_gpt.py to NOT exist initially) is not affected by leaked state.
         self.createMockFiles(['pretrain_gpt.py'])
+        pretrain_path = Path(self._tmp_dir) / 'pretrain_gpt.py'
+        self.addCleanup(lambda: pretrain_path.unlink() if pretrain_path.is_file() else None)
 
         # Helper: make run_command's side_effect create the expected .bin/.idx files
         # so _generate_dataset() (invoked from within _preprocess()) succeeds.
@@ -203,10 +207,8 @@ def _side_effect(*_args, **_kwargs):
 
         self.addCleanup(lambda: [p.unlink() for p in created_files if p.is_file()])
 
-        def _run_case(extra_params, expected_workers, expected_prefix_basename, expected_data_prefix):
-            mock_run_command.reset_mock()
-            mock_run_command.side_effect = _make_dataset_files(expected_data_prefix)
-            benchmark = benchmark_cls(
+        def _build_benchmark(extra_params):
+            return benchmark_cls(
                 self.benchmark_name,
                 parameters=(
                     f'--code_base {self._tmp_dir} --data_home {self._tmp_dir} '
@@ -214,14 +216,30 @@ def _run_case(extra_params, expected_workers, expected_prefix_basename, expected
                     f'{extra_params}'
                 ),
             )
+
+        def _run_case(extra_params, expected_workers, expected_prefix_basename, expected_data_prefix):
+            mock_run_command.reset_mock()
+            mock_run_command.side_effect = _make_dataset_files(expected_data_prefix)
+            benchmark = _build_benchmark(extra_params)
             assert benchmark._preprocess() is True
             assert mock_run_command.call_count >= 1
-            cmd = mock_run_command.call_args_list[0].args[0]
+            # Use tuple indexing instead of `.args` for Python 3.7 compatibility
+            # (mock.call.args was added in Python 3.8).
+            cmd = mock_run_command.call_args_list[0][0][0]
             units = normalize_command(cmd)
             assert f'--workers {expected_workers}' in units, units
             expected_output_prefix = os.path.join(self._tmp_dir, expected_prefix_basename)
             assert f'--output-prefix {expected_output_prefix}' in units, units
 
+        def _run_invalid_case(extra_params):
+            """Assert _preprocess() fails fast (no run_command call) for invalid data_prefix."""
+            mock_run_command.reset_mock()
+            mock_run_command.side_effect = None
+            benchmark = _build_benchmark(extra_params)
+            assert benchmark._preprocess() is False
+            assert mock_run_command.call_count == 0
+            assert benchmark.return_code == ReturnCode.DATASET_GENERATION_FAILURE
+
         # Case 1: num_workers=0 with default data_prefix should produce '--workers 1' (clamped)
         # and '--output-prefix <data_home>/dataset' (default 'dataset_text_document' suffix stripped).
         _run_case(
@@ -240,23 +258,14 @@ def _run_case(extra_params, expected_workers, expected_prefix_basename, expected
             expected_data_prefix='custom_text_document',
         )
 
-        # Case 3: data_prefix without the '_text_document' suffix is used as-is.
-        _run_case(
-            extra_params='--num_workers 2 --data_prefix mydata',
-            expected_workers=2,
-            expected_prefix_basename='mydata',
-            expected_data_prefix='mydata',
-        )
+        # Case 3: data_prefix without the '_text_document' suffix is invalid for generation
+        # because preprocess_data.py would produce 'mydata_text_document.bin/.idx' but the
+        # existence check looks for 'mydata.bin/.idx'. _preprocess() must fail fast.
+        _run_invalid_case(extra_params='--num_workers 2 --data_prefix mydata')
 
-        # Case 4: edge case - data_prefix == '_text_document' should NOT strip down
-        # to an empty basename (which would produce '--output-prefix <data_home>/').
-        # Fall back to using '_text_document' as the basename.
-        _run_case(
-            extra_params='--num_workers 1 --data_prefix _text_document',
-            expected_workers=1,
-            expected_prefix_basename='_text_document',
-            expected_data_prefix='_text_document',
-        )
+        # Case 4: data_prefix == '_text_document' has an empty stem after stripping the suffix,
+        # which would produce a malformed '--output-prefix <data_home>/'. Must fail fast.
+        _run_invalid_case(extra_params='--num_workers 1 --data_prefix _text_document')
 
     @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
     def test_megatron_gpt_command(self, mock_generate_dataset):

From d580e6e265eb6b50c2167ad0f1ca1ae2c2b794c7 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang <hongtaozhang@microsoft.com>
Date: Fri, 15 May 2026 21:04:27 +0000
Subject: [PATCH 7/8] style: apply yapf 0.31.0 formatting (CI lint fix)

- Collapse multi-line .format() call in _generate_dataset error message.
- Add blank line before 'return _side_effect' in nested function in test.
---
 superbench/benchmarks/model_benchmarks/megatron_gpt3.py | 4 +---
 tests/benchmarks/model_benchmarks/test_megatron_gpt.py  | 1 +
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index fec372211..4d869310d 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -661,9 +661,7 @@ def _generate_dataset(self):    # noqa: C901
                         logger.error(
                             'data_prefix must end with "{}" and have a non-empty stem when '
                             'dataset generation is required (got "{}"). preprocess_data.py '
-                            'always appends "{}" to --output-prefix.'.format(
-                                suffix, self._args.data_prefix, suffix
-                            )
+                            'always appends "{}" to --output-prefix.'.format(suffix, self._args.data_prefix, suffix)
                         )
                         self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE)
                         return False
diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index 034fbf28f..43ff58a68 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -203,6 +203,7 @@ def _side_effect(*_args, **_kwargs):
                     p = Path(self._tmp_dir) / f'{prefix}{ext}'
                     p.touch()
                     created_files.append(p)
+
             return _side_effect
 
         self.addCleanup(lambda: [p.unlink() for p in created_files if p.is_file()])

From a88294d8cb7ca9306747706e837102fd7283ea63 Mon Sep 17 00:00:00 2001
From: Hongtao Zhang <hongtaozhang@microsoft.com>
Date: Wed, 27 May 2026 21:28:04 +0000
Subject: [PATCH 8/8] Harden dataset generation argument validation

---
 .../model_benchmarks/megatron_gpt3.py         | 19 +++++++++++-----
 .../benchmarks/model_benchmarks/model_base.py |  3 ++-
 .../model_benchmarks/test_megatron_gpt.py     | 22 ++++++++++++++++++-
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
index 4d869310d..d61d055c3 100644
--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -649,9 +649,6 @@ def _generate_dataset(self):    # noqa: C901
             if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \
                     or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')):
                 if self._args.dataset_url:
-                    self._raw_data_path = str(Path(self._args.data_home) / 'data.json')
-                    download_file(self._args.dataset_url, self._raw_data_path)
-
                     # Megatron's preprocess_data.py appends '_text_document' to --output-prefix
                     # when producing the .bin/.idx files. For the existence check below
                     # (which looks for {data_prefix}.bin/.idx) to pass, data_prefix must end
@@ -665,13 +662,25 @@ def _generate_dataset(self):    # noqa: C901
                         )
                         self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE)
                         return False
+
+                    if self._args.num_workers < 0:
+                        logger.error(
+                            'num_workers must be >= 0 (got {}).'.format(self._args.num_workers)
+                        )
+                        self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
+                        return False
+
+                    self._raw_data_path = str(Path(self._args.data_home) / 'data.json')
+                    download_file(self._args.dataset_url, self._raw_data_path)
+
                     output_prefix_basename = self._args.data_prefix[:-len(suffix)]
                     output_prefix = os.path.join(self._args.data_home, output_prefix_basename)
 
                     # num_workers=0 is valid for DataLoader (main process loads data),
                     # but preprocess_data.py requires workers>=1 for multiprocessing.Pool.
-                    preprocess_workers = max(1, self._args.num_workers)
-                    if preprocess_workers != self._args.num_workers:
+                    preprocess_workers = self._args.num_workers
+                    if self._args.num_workers == 0:
+                        preprocess_workers = 1
                         logger.warning(
                             'preprocess_data.py requires --workers >= 1; '
                             'overriding num_workers={} to {} for dataset preprocessing only '
diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py
index 3e3cf0443..d847018e5 100644
--- a/superbench/benchmarks/model_benchmarks/model_base.py
+++ b/superbench/benchmarks/model_benchmarks/model_base.py
@@ -242,7 +242,8 @@ def _preprocess(self):
         self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size
 
         if not self._generate_dataset():
-            self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE)
+            if self._result.return_code == ReturnCode.SUCCESS:
+                self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE)
             return False
 
         if not self._init_dataloader():
diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
index 43ff58a68..aece16c08 100644
--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -206,7 +206,12 @@ def _side_effect(*_args, **_kwargs):
 
             return _side_effect
 
-        self.addCleanup(lambda: [p.unlink() for p in created_files if p.is_file()])
+        def _cleanup_created_files():
+            for p in created_files:
+                if p.is_file():
+                    p.unlink()
+
+        self.addCleanup(_cleanup_created_files)
 
         def _build_benchmark(extra_params):
             return benchmark_cls(
@@ -241,6 +246,18 @@ def _run_invalid_case(extra_params):
             assert mock_run_command.call_count == 0
             assert benchmark.return_code == ReturnCode.DATASET_GENERATION_FAILURE
 
+        def _run_invalid_workers_case(extra_params):
+            """Assert _preprocess() rejects negative num_workers as invalid input."""
+            mock_run_command.reset_mock()
+            mock_run_command.side_effect = None
+            mock_download_file.reset_mock()
+            benchmark = _build_benchmark(extra_params)
+            assert benchmark._preprocess() is False
+            assert mock_run_command.call_count == 0
+            # Only vocab + merges are downloaded before validation failure.
+            assert mock_download_file.call_count == 2
+            assert benchmark.return_code == ReturnCode.INVALID_ARGUMENT
+
         # Case 1: num_workers=0 with default data_prefix should produce '--workers 1' (clamped)
         # and '--output-prefix <data_home>/dataset' (default 'dataset_text_document' suffix stripped).
         _run_case(
@@ -268,6 +285,9 @@ def _run_invalid_case(extra_params):
         # which would produce a malformed '--output-prefix <data_home>/'. Must fail fast.
         _run_invalid_case(extra_params='--num_workers 1 --data_prefix _text_document')
 
+        # Case 5: negative num_workers is invalid input and should fail fast.
+        _run_invalid_workers_case(extra_params='--num_workers -1 --data_prefix negative_text_document')
+
     @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
     def test_megatron_gpt_command(self, mock_generate_dataset):
         """Test command generation."""