From 203a883552e58a7e1a055e6a92653ae3def22ff7 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Wed, 1 Apr 2026 06:06:01 +0000 Subject: [PATCH 1/8] Fix the num_workers usage. --- superbench/benchmarks/model_benchmarks/megatron_gpt3.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 37d27bf1a..9d34ab9eb 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -651,13 +651,16 @@ def _generate_dataset(self): if self._args.dataset_url: self._raw_data_path = str(Path(self._args.data_home) / 'data.json') download_file(self._args.dataset_url, self._raw_data_path) + command = ( 'python3 ' f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} ' f'--input {self._raw_data_path} ' f'--tokenizer-type {self._args.tokenizer_type} ' f'--output-prefix {os.path.join(self._args.data_home, "dataset")} ' - f'--workers {str(self._args.num_workers)} ' + # num_workers=0 is valid for DataLoader (main process loads data), + # but preprocess_data.py requires workers>=1 for multiprocessing.Pool. + f'--workers {max(1, self._args.num_workers)} ' f'--vocab-file {self._vocab_path} ' f'--merge-file {self._merges_path}' ) From c6d205e4d1e9903470d66fb299547c04b950e253 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Mon, 27 Apr 2026 21:59:47 +0000 Subject: [PATCH 2/8] Address PR review: derive --output-prefix from data_prefix, add tests - Megatron's preprocess_data.py appends '_text_document' to the --output-prefix when producing the .bin/.idx files. Derive the output-prefix from --data_prefix (stripping a trailing '_text_document' suffix when present) so that the generated files match the existence checks for any custom data_prefix value, instead of being hardcoded to '/dataset'. - Add unit test test_megatron_gpt_dataset_generate_command covering: num_workers=0 clamps to '--workers 1' with default data_prefix ('dataset_text_document' -> '/dataset'), num_workers=4 with custom 'custom_text_document' -> '/custom', and a data_prefix without the suffix used as-is. --- .../model_benchmarks/megatron_gpt3.py | 12 +++- .../model_benchmarks/test_megatron_gpt.py | 66 +++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 9d34ab9eb..1143fc2ff 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -652,12 +652,22 @@ def _generate_dataset(self): self._raw_data_path = str(Path(self._args.data_home) / 'data.json') download_file(self._args.dataset_url, self._raw_data_path) + # Megatron's preprocess_data.py appends '_text_document' to --output-prefix + # when producing the .bin/.idx files. Derive the output-prefix from + # data_prefix (stripping the '_text_document' suffix when present) so that + # the generated files match the existence checks above for any custom + # data_prefix value. + output_prefix_basename = self._args.data_prefix + if output_prefix_basename.endswith('_text_document'): + output_prefix_basename = output_prefix_basename[:-len('_text_document')] + output_prefix = os.path.join(self._args.data_home, output_prefix_basename) + command = ( 'python3 ' f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} ' f'--input {self._raw_data_path} ' f'--tokenizer-type {self._args.tokenizer_type} ' - f'--output-prefix {os.path.join(self._args.data_home, "dataset")} ' + f'--output-prefix {output_prefix} ' # num_workers=0 is valid for DataLoader (main process loads data), # but preprocess_data.py requires workers>=1 for multiprocessing.Pool. f'--workers {max(1, self._args.num_workers)} ' diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index b7c588677..b0efd3483 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -174,6 +174,72 @@ def test_megatron_gpt_dataset(self): ret = benchmark._generate_dataset() assert (ret is True) + @mock.patch('superbench.benchmarks.model_benchmarks.megatron_gpt3.run_command') + @mock.patch('superbench.benchmarks.model_benchmarks.megatron_gpt3.download_file') + def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_run_command): + """Verify _generate_dataset clamps --workers to >=1 and derives --output-prefix from data_prefix.""" + (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA) + assert (benchmark_cls) + os.environ['OMPI_COMM_WORLD_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1' + os.environ['OMPI_COMM_WORLD_RANK'] = '0' + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' + + # Case 1: num_workers=0 with default data_prefix should produce + # '--workers 1' (clamped) and '--output-prefix /dataset' + # (default data_prefix='dataset_text_document' with the suffix stripped). + benchmark = benchmark_cls( + self.benchmark_name, + parameters=( + f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} ' + f'--batch_size 2048 --num_workers 0 ' + f'--dataset_url http://example.com/data.json' + ), + ) + benchmark._preprocess() + ret = benchmark._generate_dataset() + # Dataset generation will fail because the mocked run_command does not actually + # produce .bin/.idx files; we only care about the constructed command. + assert ret is False + assert mock_run_command.call_count >= 1 + cmd = mock_run_command.call_args_list[0].args[0] + assert '--workers 1' in cmd, cmd + assert f'--output-prefix {os.path.join(self._tmp_dir, "dataset")} ' in cmd, cmd + + # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should + # produce '--workers 4' and '--output-prefix /custom'. + mock_run_command.reset_mock() + benchmark = benchmark_cls( + self.benchmark_name, + parameters=( + f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} ' + f'--batch_size 2048 --num_workers 4 --data_prefix custom_text_document ' + f'--dataset_url http://example.com/data.json' + ), + ) + benchmark._preprocess() + benchmark._generate_dataset() + cmd = mock_run_command.call_args_list[0].args[0] + assert '--workers 4' in cmd, cmd + assert f'--output-prefix {os.path.join(self._tmp_dir, "custom")} ' in cmd, cmd + + # Case 3: data_prefix without the '_text_document' suffix is used as-is. + mock_run_command.reset_mock() + benchmark = benchmark_cls( + self.benchmark_name, + parameters=( + f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} ' + f'--batch_size 2048 --num_workers 2 --data_prefix mydata ' + f'--dataset_url http://example.com/data.json' + ), + ) + benchmark._preprocess() + benchmark._generate_dataset() + cmd = mock_run_command.call_args_list[0].args[0] + assert '--workers 2' in cmd, cmd + assert f'--output-prefix {os.path.join(self._tmp_dir, "mydata")} ' in cmd, cmd + @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') def test_megatron_gpt_command(self, mock_generate_dataset): """Test command generation.""" From ee2839e9d72fec7ee799fd8a1a71deaf90cf9c51 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Mon, 27 Apr 2026 22:03:02 +0000 Subject: [PATCH 3/8] Tighten test substring matches with trailing space Avoid '--workers 1' matching '--workers 10' etc. --- tests/benchmarks/model_benchmarks/test_megatron_gpt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index b0efd3483..45a6b570e 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -204,7 +204,7 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru assert ret is False assert mock_run_command.call_count >= 1 cmd = mock_run_command.call_args_list[0].args[0] - assert '--workers 1' in cmd, cmd + assert '--workers 1 ' in cmd, cmd assert f'--output-prefix {os.path.join(self._tmp_dir, "dataset")} ' in cmd, cmd # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should @@ -221,7 +221,7 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru benchmark._preprocess() benchmark._generate_dataset() cmd = mock_run_command.call_args_list[0].args[0] - assert '--workers 4' in cmd, cmd + assert '--workers 4 ' in cmd, cmd assert f'--output-prefix {os.path.join(self._tmp_dir, "custom")} ' in cmd, cmd # Case 3: data_prefix without the '_text_document' suffix is used as-is. @@ -237,7 +237,7 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru benchmark._preprocess() benchmark._generate_dataset() cmd = mock_run_command.call_args_list[0].args[0] - assert '--workers 2' in cmd, cmd + assert '--workers 2 ' in cmd, cmd assert f'--output-prefix {os.path.join(self._tmp_dir, "mydata")} ' in cmd, cmd @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') From b90dbf964c83bda263c3f7007a0bcc447a59a3b6 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Sat, 2 May 2026 00:19:08 +0000 Subject: [PATCH 4/8] Address PR review: use normalize_command and valid code_base in dataset generate test - Replace brittle whitespace substring assertions ('--workers 1 ', '--output-prefix ... ') with normalize_command()-based parsed CLI unit checks, so the test validates semantics rather than formatting. - Use --code_base {self._tmp_dir} together with createMockFiles(['pretrain_gpt.py']) to avoid the unrealistic /root/Megatron-DeepSpeed path. The mocked run_command now creates the expected .bin/.idx files via side_effect so _preprocess() succeeds end-to-end and is asserted to be True. --- .../model_benchmarks/test_megatron_gpt.py | 104 ++++++++++-------- 1 file changed, 56 insertions(+), 48 deletions(-) diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index 45a6b570e..7e0b7a6bb 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -186,59 +186,67 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru os.environ['MASTER_ADDR'] = 'localhost' os.environ['MASTER_PORT'] = '12345' - # Case 1: num_workers=0 with default data_prefix should produce - # '--workers 1' (clamped) and '--output-prefix /dataset' - # (default data_prefix='dataset_text_document' with the suffix stripped). - benchmark = benchmark_cls( - self.benchmark_name, - parameters=( - f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} ' - f'--batch_size 2048 --num_workers 0 ' - f'--dataset_url http://example.com/data.json' - ), + # Use a real, valid code_base so _preprocess() can validate it (avoid hardcoded /root path). + self.createMockFiles(['pretrain_gpt.py']) + + # Helper: make run_command's side_effect create the expected .bin/.idx files + # so _generate_dataset() (invoked from within _preprocess()) succeeds. + created_files = [] + + def _make_dataset_files(prefix): + def _side_effect(*_args, **_kwargs): + for ext in ('.bin', '.idx'): + p = Path(self._tmp_dir) / f'{prefix}{ext}' + p.touch() + created_files.append(p) + return _side_effect + + self.addCleanup(lambda: [p.unlink() for p in created_files if p.is_file()]) + + def _run_case(extra_params, expected_workers, expected_prefix_basename, expected_data_prefix): + mock_run_command.reset_mock() + mock_run_command.side_effect = _make_dataset_files(expected_data_prefix) + benchmark = benchmark_cls( + self.benchmark_name, + parameters=( + f'--code_base {self._tmp_dir} --data_home {self._tmp_dir} ' + f'--batch_size 2048 --dataset_url http://example.com/data.json ' + f'{extra_params}' + ), + ) + assert benchmark._preprocess() is True + assert mock_run_command.call_count >= 1 + cmd = mock_run_command.call_args_list[0].args[0] + units = normalize_command(cmd) + assert f'--workers {expected_workers}' in units, units + expected_output_prefix = os.path.join(self._tmp_dir, expected_prefix_basename) + assert f'--output-prefix {expected_output_prefix}' in units, units + + # Case 1: num_workers=0 with default data_prefix should produce '--workers 1' (clamped) + # and '--output-prefix /dataset' (default 'dataset_text_document' suffix stripped). + _run_case( + extra_params='--num_workers 0', + expected_workers=1, + expected_prefix_basename='dataset', + expected_data_prefix='dataset_text_document', ) - benchmark._preprocess() - ret = benchmark._generate_dataset() - # Dataset generation will fail because the mocked run_command does not actually - # produce .bin/.idx files; we only care about the constructed command. - assert ret is False - assert mock_run_command.call_count >= 1 - cmd = mock_run_command.call_args_list[0].args[0] - assert '--workers 1 ' in cmd, cmd - assert f'--output-prefix {os.path.join(self._tmp_dir, "dataset")} ' in cmd, cmd - - # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should - # produce '--workers 4' and '--output-prefix /custom'. - mock_run_command.reset_mock() - benchmark = benchmark_cls( - self.benchmark_name, - parameters=( - f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} ' - f'--batch_size 2048 --num_workers 4 --data_prefix custom_text_document ' - f'--dataset_url http://example.com/data.json' - ), + + # Case 2: num_workers=4 with custom data_prefix='custom_text_document' should produce + # '--workers 4' and '--output-prefix /custom'. + _run_case( + extra_params='--num_workers 4 --data_prefix custom_text_document', + expected_workers=4, + expected_prefix_basename='custom', + expected_data_prefix='custom_text_document', ) - benchmark._preprocess() - benchmark._generate_dataset() - cmd = mock_run_command.call_args_list[0].args[0] - assert '--workers 4 ' in cmd, cmd - assert f'--output-prefix {os.path.join(self._tmp_dir, "custom")} ' in cmd, cmd # Case 3: data_prefix without the '_text_document' suffix is used as-is. - mock_run_command.reset_mock() - benchmark = benchmark_cls( - self.benchmark_name, - parameters=( - f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} ' - f'--batch_size 2048 --num_workers 2 --data_prefix mydata ' - f'--dataset_url http://example.com/data.json' - ), + _run_case( + extra_params='--num_workers 2 --data_prefix mydata', + expected_workers=2, + expected_prefix_basename='mydata', + expected_data_prefix='mydata', ) - benchmark._preprocess() - benchmark._generate_dataset() - cmd = mock_run_command.call_args_list[0].args[0] - assert '--workers 2 ' in cmd, cmd - assert f'--output-prefix {os.path.join(self._tmp_dir, "mydata")} ' in cmd, cmd @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') def test_megatron_gpt_command(self, mock_generate_dataset): From 687ac81f995a3fb2c5d8bfeb8b1afa2415c46931 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Sun, 3 May 2026 05:04:57 +0000 Subject: [PATCH 5/8] Address review feedback for num_workers / data_prefix handling - Warn when num_workers is silently clamped from 0 to 1 for the preprocess subprocess so the user sees the override in the log. The DataLoader still receives the original num_workers value. - Guard the '_text_document' suffix-strip against the edge case where data_prefix == '_text_document' (which would otherwise produce a malformed --output-prefix '/' with an empty basename). Fall back to the original data_prefix value in that case. - Extend test_megatron_gpt_dataset_generate_command with a 4th case asserting the empty-basename fallback. --- .../model_benchmarks/megatron_gpt3.py | 22 +++++++++++++++---- .../model_benchmarks/test_megatron_gpt.py | 10 +++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 1143fc2ff..7541aee4b 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -659,18 +659,32 @@ def _generate_dataset(self): # data_prefix value. output_prefix_basename = self._args.data_prefix if output_prefix_basename.endswith('_text_document'): - output_prefix_basename = output_prefix_basename[:-len('_text_document')] + stripped = output_prefix_basename[:-len('_text_document')] + # Guard against data_prefix == '_text_document' which would + # leave an empty basename and produce a malformed --output-prefix + # ending in '/'. Fall back to the original value in that case. + output_prefix_basename = stripped or output_prefix_basename output_prefix = os.path.join(self._args.data_home, output_prefix_basename) + # num_workers=0 is valid for DataLoader (main process loads data), + # but preprocess_data.py requires workers>=1 for multiprocessing.Pool. + preprocess_workers = max(1, self._args.num_workers) + if preprocess_workers != self._args.num_workers: + logger.warning( + 'preprocess_data.py requires --workers >= 1; ' + 'overriding num_workers={} to {} for dataset preprocessing only ' + '(DataLoader still uses num_workers={}).'.format( + self._args.num_workers, preprocess_workers, self._args.num_workers + ) + ) + command = ( 'python3 ' f'{os.path.join(self._args.code_base, "tools/preprocess_data.py")} ' f'--input {self._raw_data_path} ' f'--tokenizer-type {self._args.tokenizer_type} ' f'--output-prefix {output_prefix} ' - # num_workers=0 is valid for DataLoader (main process loads data), - # but preprocess_data.py requires workers>=1 for multiprocessing.Pool. - f'--workers {max(1, self._args.num_workers)} ' + f'--workers {preprocess_workers} ' f'--vocab-file {self._vocab_path} ' f'--merge-file {self._merges_path}' ) diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index 7e0b7a6bb..e856a8010 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -248,6 +248,16 @@ def _run_case(extra_params, expected_workers, expected_prefix_basename, expected expected_data_prefix='mydata', ) + # Case 4: edge case - data_prefix == '_text_document' should NOT strip down + # to an empty basename (which would produce '--output-prefix /'). + # Fall back to using '_text_document' as the basename. + _run_case( + extra_params='--num_workers 1 --data_prefix _text_document', + expected_workers=1, + expected_prefix_basename='_text_document', + expected_data_prefix='_text_document', + ) + @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') def test_megatron_gpt_command(self, mock_generate_dataset): """Test command generation.""" From 9018df049b1c14d93e7c0131b0e9c46151386e01 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Fri, 15 May 2026 20:50:16 +0000 Subject: [PATCH 6/8] fix(megatron-gpt): address CI failures and Copilot review - Fix test pollution: clean up pretrain_gpt.py created by test_megatron_gpt_dataset_generate_command so it does not break the alphabetically-later test_megatron_gpt_preprocess (root cause of CI cpu-unit-test failures on python-3.7/3.10/3.12). - Fix Python 3.7 incompatibility: use call_args_list[0][0][0] instead of .args[0] (mock.call.args is Python 3.8+). - Enforce data_prefix contract in _generate_dataset(): validate that data_prefix ends with '_text_document' and has a non-empty stem before invoking preprocess_data.py. Fail fast with a clear error otherwise (preprocess_data.py always appends '_text_document' to --output-prefix, so other prefixes would silently produce files whose names mismatch the existence check). - Replace test Cases 3 and 4 (which masked the contract bug via a side_effect that lied about preprocess_data.py output) with negative cases that assert _preprocess() fails fast and never invokes run_command for invalid data_prefix values. --- .../model_benchmarks/megatron_gpt3.py | 28 +++++----- .../model_benchmarks/test_megatron_gpt.py | 51 +++++++++++-------- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 7541aee4b..fec372211 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -630,7 +630,7 @@ def _init_distributed_setting(self): f'--node_rank {node_rank} --master_addr {addr} --master_port {port}' return True - def _generate_dataset(self): + def _generate_dataset(self): # noqa: C901 """Generate dataset for benchmarking. Return: @@ -653,17 +653,21 @@ def _generate_dataset(self): download_file(self._args.dataset_url, self._raw_data_path) # Megatron's preprocess_data.py appends '_text_document' to --output-prefix - # when producing the .bin/.idx files. Derive the output-prefix from - # data_prefix (stripping the '_text_document' suffix when present) so that - # the generated files match the existence checks above for any custom - # data_prefix value. - output_prefix_basename = self._args.data_prefix - if output_prefix_basename.endswith('_text_document'): - stripped = output_prefix_basename[:-len('_text_document')] - # Guard against data_prefix == '_text_document' which would - # leave an empty basename and produce a malformed --output-prefix - # ending in '/'. Fall back to the original value in that case. - output_prefix_basename = stripped or output_prefix_basename + # when producing the .bin/.idx files. For the existence check below + # (which looks for {data_prefix}.bin/.idx) to pass, data_prefix must end + # with '_text_document' and have a non-empty stem when generation is needed. + suffix = '_text_document' + if not self._args.data_prefix.endswith(suffix) or self._args.data_prefix == suffix: + logger.error( + 'data_prefix must end with "{}" and have a non-empty stem when ' + 'dataset generation is required (got "{}"). preprocess_data.py ' + 'always appends "{}" to --output-prefix.'.format( + suffix, self._args.data_prefix, suffix + ) + ) + self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) + return False + output_prefix_basename = self._args.data_prefix[:-len(suffix)] output_prefix = os.path.join(self._args.data_home, output_prefix_basename) # num_workers=0 is valid for DataLoader (main process loads data), diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index e856a8010..034fbf28f 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -187,7 +187,11 @@ def test_megatron_gpt_dataset_generate_command(self, mock_download_file, mock_ru os.environ['MASTER_PORT'] = '12345' # Use a real, valid code_base so _preprocess() can validate it (avoid hardcoded /root path). + # Clean up after this test so the alphabetically-later test_megatron_gpt_preprocess + # (which expects pretrain_gpt.py to NOT exist initially) is not affected by leaked state. self.createMockFiles(['pretrain_gpt.py']) + pretrain_path = Path(self._tmp_dir) / 'pretrain_gpt.py' + self.addCleanup(lambda: pretrain_path.unlink() if pretrain_path.is_file() else None) # Helper: make run_command's side_effect create the expected .bin/.idx files # so _generate_dataset() (invoked from within _preprocess()) succeeds. @@ -203,10 +207,8 @@ def _side_effect(*_args, **_kwargs): self.addCleanup(lambda: [p.unlink() for p in created_files if p.is_file()]) - def _run_case(extra_params, expected_workers, expected_prefix_basename, expected_data_prefix): - mock_run_command.reset_mock() - mock_run_command.side_effect = _make_dataset_files(expected_data_prefix) - benchmark = benchmark_cls( + def _build_benchmark(extra_params): + return benchmark_cls( self.benchmark_name, parameters=( f'--code_base {self._tmp_dir} --data_home {self._tmp_dir} ' @@ -214,14 +216,30 @@ def _run_case(extra_params, expected_workers, expected_prefix_basename, expected f'{extra_params}' ), ) + + def _run_case(extra_params, expected_workers, expected_prefix_basename, expected_data_prefix): + mock_run_command.reset_mock() + mock_run_command.side_effect = _make_dataset_files(expected_data_prefix) + benchmark = _build_benchmark(extra_params) assert benchmark._preprocess() is True assert mock_run_command.call_count >= 1 - cmd = mock_run_command.call_args_list[0].args[0] + # Use tuple indexing instead of `.args` for Python 3.7 compatibility + # (mock.call.args was added in Python 3.8). + cmd = mock_run_command.call_args_list[0][0][0] units = normalize_command(cmd) assert f'--workers {expected_workers}' in units, units expected_output_prefix = os.path.join(self._tmp_dir, expected_prefix_basename) assert f'--output-prefix {expected_output_prefix}' in units, units + def _run_invalid_case(extra_params): + """Assert _preprocess() fails fast (no run_command call) for invalid data_prefix.""" + mock_run_command.reset_mock() + mock_run_command.side_effect = None + benchmark = _build_benchmark(extra_params) + assert benchmark._preprocess() is False + assert mock_run_command.call_count == 0 + assert benchmark.return_code == ReturnCode.DATASET_GENERATION_FAILURE + # Case 1: num_workers=0 with default data_prefix should produce '--workers 1' (clamped) # and '--output-prefix /dataset' (default 'dataset_text_document' suffix stripped). _run_case( @@ -240,23 +258,14 @@ def _run_case(extra_params, expected_workers, expected_prefix_basename, expected expected_data_prefix='custom_text_document', ) - # Case 3: data_prefix without the '_text_document' suffix is used as-is. - _run_case( - extra_params='--num_workers 2 --data_prefix mydata', - expected_workers=2, - expected_prefix_basename='mydata', - expected_data_prefix='mydata', - ) + # Case 3: data_prefix without the '_text_document' suffix is invalid for generation + # because preprocess_data.py would produce 'mydata_text_document.bin/.idx' but the + # existence check looks for 'mydata.bin/.idx'. _preprocess() must fail fast. + _run_invalid_case(extra_params='--num_workers 2 --data_prefix mydata') - # Case 4: edge case - data_prefix == '_text_document' should NOT strip down - # to an empty basename (which would produce '--output-prefix /'). - # Fall back to using '_text_document' as the basename. - _run_case( - extra_params='--num_workers 1 --data_prefix _text_document', - expected_workers=1, - expected_prefix_basename='_text_document', - expected_data_prefix='_text_document', - ) + # Case 4: data_prefix == '_text_document' has an empty stem after stripping the suffix, + # which would produce a malformed '--output-prefix /'. Must fail fast. + _run_invalid_case(extra_params='--num_workers 1 --data_prefix _text_document') @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') def test_megatron_gpt_command(self, mock_generate_dataset): From d580e6e265eb6b50c2167ad0f1ca1ae2c2b794c7 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Fri, 15 May 2026 21:04:27 +0000 Subject: [PATCH 7/8] style: apply yapf 0.31.0 formatting (CI lint fix) - Collapse multi-line .format() call in _generate_dataset error message. - Add blank line before 'return _side_effect' in nested function in test. --- superbench/benchmarks/model_benchmarks/megatron_gpt3.py | 4 +--- tests/benchmarks/model_benchmarks/test_megatron_gpt.py | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index fec372211..4d869310d 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -661,9 +661,7 @@ def _generate_dataset(self): # noqa: C901 logger.error( 'data_prefix must end with "{}" and have a non-empty stem when ' 'dataset generation is required (got "{}"). preprocess_data.py ' - 'always appends "{}" to --output-prefix.'.format( - suffix, self._args.data_prefix, suffix - ) + 'always appends "{}" to --output-prefix.'.format(suffix, self._args.data_prefix, suffix) ) self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) return False diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index 034fbf28f..43ff58a68 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -203,6 +203,7 @@ def _side_effect(*_args, **_kwargs): p = Path(self._tmp_dir) / f'{prefix}{ext}' p.touch() created_files.append(p) + return _side_effect self.addCleanup(lambda: [p.unlink() for p in created_files if p.is_file()]) From a88294d8cb7ca9306747706e837102fd7283ea63 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Wed, 27 May 2026 21:28:04 +0000 Subject: [PATCH 8/8] Harden dataset generation argument validation --- .../model_benchmarks/megatron_gpt3.py | 19 +++++++++++----- .../benchmarks/model_benchmarks/model_base.py | 3 ++- .../model_benchmarks/test_megatron_gpt.py | 22 ++++++++++++++++++- 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py index 4d869310d..d61d055c3 100644 --- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py +++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py @@ -649,9 +649,6 @@ def _generate_dataset(self): # noqa: C901 if not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.bin')) \ or not os.path.exists(os.path.join(self._args.data_home, f'{self._args.data_prefix}.idx')): if self._args.dataset_url: - self._raw_data_path = str(Path(self._args.data_home) / 'data.json') - download_file(self._args.dataset_url, self._raw_data_path) - # Megatron's preprocess_data.py appends '_text_document' to --output-prefix # when producing the .bin/.idx files. For the existence check below # (which looks for {data_prefix}.bin/.idx) to pass, data_prefix must end @@ -665,13 +662,25 @@ def _generate_dataset(self): # noqa: C901 ) self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) return False + + if self._args.num_workers < 0: + logger.error( + 'num_workers must be >= 0 (got {}).'.format(self._args.num_workers) + ) + self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) + return False + + self._raw_data_path = str(Path(self._args.data_home) / 'data.json') + download_file(self._args.dataset_url, self._raw_data_path) + output_prefix_basename = self._args.data_prefix[:-len(suffix)] output_prefix = os.path.join(self._args.data_home, output_prefix_basename) # num_workers=0 is valid for DataLoader (main process loads data), # but preprocess_data.py requires workers>=1 for multiprocessing.Pool. - preprocess_workers = max(1, self._args.num_workers) - if preprocess_workers != self._args.num_workers: + preprocess_workers = self._args.num_workers + if self._args.num_workers == 0: + preprocess_workers = 1 logger.warning( 'preprocess_data.py requires --workers >= 1; ' 'overriding num_workers={} to {} for dataset preprocessing only ' diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py index 3e3cf0443..d847018e5 100644 --- a/superbench/benchmarks/model_benchmarks/model_base.py +++ b/superbench/benchmarks/model_benchmarks/model_base.py @@ -242,7 +242,8 @@ def _preprocess(self): self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size if not self._generate_dataset(): - self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) + if self._result.return_code == ReturnCode.SUCCESS: + self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE) return False if not self._init_dataloader(): diff --git a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py index 43ff58a68..aece16c08 100644 --- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py +++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py @@ -206,7 +206,12 @@ def _side_effect(*_args, **_kwargs): return _side_effect - self.addCleanup(lambda: [p.unlink() for p in created_files if p.is_file()]) + def _cleanup_created_files(): + for p in created_files: + if p.is_file(): + p.unlink() + + self.addCleanup(_cleanup_created_files) def _build_benchmark(extra_params): return benchmark_cls( @@ -241,6 +246,18 @@ def _run_invalid_case(extra_params): assert mock_run_command.call_count == 0 assert benchmark.return_code == ReturnCode.DATASET_GENERATION_FAILURE + def _run_invalid_workers_case(extra_params): + """Assert _preprocess() rejects negative num_workers as invalid input.""" + mock_run_command.reset_mock() + mock_run_command.side_effect = None + mock_download_file.reset_mock() + benchmark = _build_benchmark(extra_params) + assert benchmark._preprocess() is False + assert mock_run_command.call_count == 0 + # Only vocab + merges are downloaded before validation failure. + assert mock_download_file.call_count == 2 + assert benchmark.return_code == ReturnCode.INVALID_ARGUMENT + # Case 1: num_workers=0 with default data_prefix should produce '--workers 1' (clamped) # and '--output-prefix /dataset' (default 'dataset_text_document' suffix stripped). _run_case( @@ -268,6 +285,9 @@ def _run_invalid_case(extra_params): # which would produce a malformed '--output-prefix /'. Must fail fast. _run_invalid_case(extra_params='--num_workers 1 --data_prefix _text_document') + # Case 5: negative num_workers is invalid input and should fail fast. + _run_invalid_workers_case(extra_params='--num_workers -1 --data_prefix negative_text_document') + @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') def test_megatron_gpt_command(self, mock_generate_dataset): """Test command generation."""