From b5c4886c238e5393900c61f1f5d82378f4cf7e61 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Thu, 11 Jul 2024 11:55:43 +0000
Subject: [PATCH 1/8] update omegaconf version

---
 setup.py                       |  2 +-
 superbench/config/default.yaml |  1 +
 superbench/runner/runner.py    | 32 ++++++++++++++++----------------
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/setup.py b/setup.py
index a0b859c14..efa10ef12 100644
--- a/setup.py
+++ b/setup.py
@@ -164,7 +164,7 @@ def run(self):
         'natsort>=7.1.1',
         'networkx>=2.5',
         'numpy>=1.19.2',
-        'omegaconf==2.0.6',
+        'omegaconf==2.3.0',
         'openpyxl>=3.0.7',
         'packaging>=21.0',
         'pandas>=1.1.5',
diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index 9533806cd..bd9f82010 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -208,6 +208,7 @@ superbench:
         batch_size: 1
         precision: int8
     megatron-gpt:
+      enable: false
       modes:
         - name: mpi
           proc_num: 1
diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 7e29f4dfe..3d3291e6a 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -67,24 +67,24 @@ def __validate_sb_config(self):    # noqa: C901
             InvalidConfigError: If input config is invalid.
         """
         # TODO: add validation and defaulting
-        if not self._sb_config.superbench.env:
+        if 'env' not in self._sb_config.superbench:
             self._sb_config.superbench.env = {}
         for name in self._sb_benchmarks:
-            if not self._sb_benchmarks[name].modes:
+            if 'modes' not in self._sb_benchmarks[name].modes:
                 self._sb_benchmarks[name].modes = []
             for idx, mode in enumerate(self._sb_benchmarks[name].modes):
-                if not mode.env:
+                if 'env' not in mode:
                     self._sb_benchmarks[name].modes[idx].env = {}
                 if mode.name == 'local':
-                    if not mode.proc_num:
+                    if 'proc_num' not in mode:
                         self._sb_benchmarks[name].modes[idx].proc_num = 1
-                    if not mode.prefix:
+                    if 'prefix' not in mode:
                         self._sb_benchmarks[name].modes[idx].prefix = ''
                 elif mode.name == 'torch.distributed':
-                    if not mode.proc_num:
+                    if 'proc_num' not in mode:
                         self._sb_benchmarks[name].modes[idx].proc_num = 8
                 elif mode.name == 'mpi':
-                    if not mode.mca:
+                    if 'mca' not in mode:
                         self._sb_benchmarks[name].modes[idx].mca = {
                             'pml': 'ob1',
                             'btl': '^openib',
@@ -93,8 +93,8 @@ def __validate_sb_config(self):    # noqa: C901
                         }
                     for key in ['PATH', 'LD_LIBRARY_PATH', 'SB_MICRO_PATH', 'SB_WORKSPACE']:
                         self._sb_benchmarks[name].modes[idx].env.setdefault(key, None)
-                if mode.pattern:
-                    if mode.pattern.type == 'topo-aware' and not mode.pattern.ibstat:
+                if 'pattern' in mode:
+                    if mode.pattern.type == 'topo-aware' and 'ibstat' not in mode.pattern:
                         self._sb_benchmarks[name].modes[idx].pattern.ibstat = gen_ibstat(
                             self._ansible_config, str(self._output_path / 'ibstate_file.txt')
                         )
@@ -141,7 +141,7 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
         elif mode.name == 'torch.distributed':
             # TODO: replace with torch.distributed.run in v1.9
             # TODO: only supports node_num=1 and node_num=all currently
-            torch_dist_params = '' if mode.node_num == 1 else \
+            torch_dist_params = '' if 'node_num' in mode and mode.node_num == 1 else \
                 '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
             mode_command = (
                 f'torchrun'
@@ -158,8 +158,8 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
                 '-bind-to numa '    # bind processes to numa
                 '{mca_list} {env_list} {command}'
             ).format(
-                host_list=f'-host localhost:{mode.proc_num}' if mode.node_num == 1 else
-                f'-hostfile hostfile -map-by ppr:{mode.proc_num}:node' if mode.host_list is None else '-host ' +
+                host_list=f'-host localhost:{mode.proc_num}' if 'node_num' in mode and mode.node_num == 1 else
+                f'-hostfile hostfile -map-by ppr:{mode.proc_num}:node' if 'host_list' not in mode else '-host ' +
                 ','.join(f'{host}:{mode.proc_num}' for host in mode.host_list),
                 mca_list=' '.join(f'-mca {k} {v}' for k, v in mode.mca.items()),
                 env_list=' '.join(
@@ -441,11 +441,11 @@ def _run_proc(self, benchmark_name, mode, vars):
             int: Process return code.
         """
         mode.update(vars)
-        if mode.name == 'mpi' and mode.pattern:
+        if mode.name == 'mpi' and 'pattern' in mode:
             mode.env.update({'SB_MODE_SERIAL_INDEX': mode.serial_index, 'SB_MODE_PARALLEL_INDEX': mode.parallel_index})
         logger.info('Runner is going to run %s in %s mode, proc rank %d.', benchmark_name, mode.name, mode.proc_rank)

-        timeout = self._sb_benchmarks[benchmark_name].timeout
+        timeout = self._sb_benchmarks[benchmark_name].get('timeout', 60)
         if isinstance(timeout, int):
             timeout = max(timeout, 60)
@@ -463,7 +463,7 @@ def _run_proc(self, benchmark_name, mode, vars):
         ansible_runner_config = self._ansible_client.get_shell_config(
             fcmd.format(env_list=env_list, command=self.__get_mode_command(benchmark_name, mode, timeout))
         )
-        if mode.name == 'mpi' and mode.node_num != 1:
+        if mode.name == 'mpi' and 'node_num' in mode and mode.node_num != 1:
             ansible_runner_config = self._ansible_client.update_mpi_config(ansible_runner_config)

         if isinstance(timeout, int):
@@ -495,7 +495,7 @@ def run(self):
                             )
                             ansible_rc = sum(rc_list)
                         elif mode.name == 'torch.distributed' or mode.name == 'mpi':
-                            if not mode.pattern:
+                            if 'pattern' not in mode:
                                 ansible_rc = self._run_proc(benchmark_name, mode, {'proc_rank': 0})
                             else:
                                 if not os.path.exists(self._output_path / 'hostfile'):
From 6ed99a13a07248600a9381450f5dd83fb008d335 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Fri, 12 Jul 2024 10:54:35 +0000
Subject: [PATCH 2/8] pass test cases

---
 superbench/executor/executor.py | 37 +++++++++++++++++++++------------------
 superbench/runner/runner.py     |  2 +-
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py
index bfff5cb7c..2aa230b31 100644
--- a/superbench/executor/executor.py
+++ b/superbench/executor/executor.py
@@ -228,32 +228,33 @@ def exec(self):
                 logger.warning('Monitor can not support CPU platform.')

             benchmark_real_name = benchmark_name.split(':')[0]
-            for framework in benchmark_config.frameworks or [Framework.NONE.value]:
-                if benchmark_real_name == 'model-benchmarks' or (
-                    ':' not in benchmark_name and benchmark_name.endswith('_models')
-                ):
-                    for model in benchmark_config.models:
-                        full_name = f'{benchmark_name}/{framework}-{model}'
+            if 'frameworks' in benchmark_config:
+                for framework in benchmark_config.frameworks or [Framework.NONE.value]:
+                    if benchmark_real_name == 'model-benchmarks' or (
+                        ':' not in benchmark_name and benchmark_name.endswith('_models')
+                    ):
+                        for model in benchmark_config.models:
+                            full_name = f'{benchmark_name}/{framework}-{model}'
+                            logger.info('Executor is going to execute %s.', full_name)
+                            context = BenchmarkRegistry.create_benchmark_context(
+                                model,
+                                platform=self.__get_platform(),
+                                framework=Framework(framework.lower()),
+                                parameters=self.__get_arguments(benchmark_config.parameters)
+                            )
+                            result = self.__exec_benchmark(full_name, context)
+                            benchmark_results.append(result)
+                    else:
+                        full_name = benchmark_name
                         logger.info('Executor is going to execute %s.', full_name)
                         context = BenchmarkRegistry.create_benchmark_context(
-                            model,
+                            benchmark_real_name,
                             platform=self.__get_platform(),
                             framework=Framework(framework.lower()),
                             parameters=self.__get_arguments(benchmark_config.parameters)
                         )
                         result = self.__exec_benchmark(full_name, context)
                         benchmark_results.append(result)
-                else:
-                    full_name = benchmark_name
-                    logger.info('Executor is going to execute %s.', full_name)
-                    context = BenchmarkRegistry.create_benchmark_context(
-                        benchmark_real_name,
-                        platform=self.__get_platform(),
-                        framework=Framework(framework.lower()),
-                        parameters=self.__get_arguments(benchmark_config.parameters)
-                    )
-                    result = self.__exec_benchmark(full_name, context)
-                    benchmark_results.append(result)

             if monitor:
                 monitor.stop()
diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 3d3291e6a..b62e36db8 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -225,7 +225,7 @@ def check_env(self):    # pragma: no cover
             self._ansible_client.get_playbook_config(
                 'check_env.yaml',
                 extravars={
-                    'no_docker': bool(self._docker_config.skip),
+                    'no_docker': False if 'skip' not in self._docker_config else bool(self._docker_config.skip),
                     'output_dir': str(self._output_path),
                     'env': '\n'.join(f'{k}={v}' for k, v in self._sb_config.superbench.env.items()),
                 }
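[Editorial aside, not part of the patch series] The check_env change guards a single optional key with a conditional expression, which is verbose but explicit. For what it is worth, omegaconf (2.1+) also exposes OmegaConf.select() for exactly this read-with-default case; a sketch of the equivalence, not what the patch uses:

    from omegaconf import OmegaConf

    docker_config = OmegaConf.create({})    # 'skip' deliberately left unset

    # The conditional expression as written in the patch:
    no_docker = False if 'skip' not in docker_config else bool(docker_config.skip)

    # The equivalent one-liner with an explicit default:
    assert OmegaConf.select(docker_config, 'skip', default=False) == no_docker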
From fdc177dcc1ae5ffa002ba7ac28a5da650beac948 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Fri, 12 Jul 2024 12:19:27 +0000
Subject: [PATCH 3/8] pass executor

---
 superbench/config/default.yaml  | 1 -
 superbench/executor/executor.py | 4 ++--
 superbench/runner/runner.py     | 4 ++--
 tests/executor/test_executor.py | 2 +-
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index bd9f82010..9533806cd 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -208,7 +208,6 @@ superbench:
         batch_size: 1
         precision: int8
     megatron-gpt:
-      enable: false
       modes:
         - name: mpi
           proc_num: 1
diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py
index 2aa230b31..926e68170 100644
--- a/superbench/executor/executor.py
+++ b/superbench/executor/executor.py
@@ -71,13 +71,13 @@ def __get_enabled_benchmarks(self):
         Return:
             list: List of benchmarks which will be executed.
         """
-        if self._sb_config.superbench.enable:
+        if 'enable' in self._sb_config.superbench and self._sb_config.superbench.enable:
             if isinstance(self._sb_config.superbench.enable, str):
                 return [self._sb_config.superbench.enable]
             elif isinstance(self._sb_config.superbench.enable, (list, ListConfig)):
                 return list(self._sb_config.superbench.enable)
         # TODO: may exist order issue
-        return [k for k, v in self._sb_benchmarks.items() if v.enable]
+        return [k for k, v in self._sb_benchmarks.items() if 'enable' in v and v.enable]

     def __get_platform(self):
         """Detect runninng platform by environment."""
diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index b62e36db8..cd0a5d262 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -105,12 +105,12 @@ def __get_enabled_benchmarks(self):
         Return:
             list: List of benchmarks which will be executed.
         """
-        if self._sb_config.superbench.enable:
+        if 'enable' in self._sb_config.superbench and self._sb_config.superbench.enable:
             if isinstance(self._sb_config.superbench.enable, str):
                 return [self._sb_config.superbench.enable]
             elif isinstance(self._sb_config.superbench.enable, (list, ListConfig)):
                 return list(self._sb_config.superbench.enable)
-        return [k for k, v in self._sb_benchmarks.items() if v.enable]
+        return [k for k, v in self._sb_benchmarks.items() if 'enable' in v and v.enable]

     def __get_mode_command(self, benchmark_name, mode, timeout=None):
         """Get runner command for given mode.
diff --git a/tests/executor/test_executor.py b/tests/executor/test_executor.py
index a9365e6c1..984f0437e 100644
--- a/tests/executor/test_executor.py
+++ b/tests/executor/test_executor.py
@@ -44,7 +44,7 @@ def test_set_logger(self):
     def test_get_enabled_benchmarks_enable_none(self):
         """Test enabled benchmarks when superbench.enable is none."""
         benchmarks = self.default_config.superbench.benchmarks
-        expected_enabled_benchmarks = [x for x in benchmarks if benchmarks[x]['enable']]
+        expected_enabled_benchmarks = [x for x in benchmarks if 'enable' in benchmarks[x] and benchmarks[x]['enable']]
         self.assertListEqual(self.executor._sb_enabled, expected_enabled_benchmarks)

     def test_get_enabled_benchmarks_enable_str(self):
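[Editorial aside, not part of the patch series] After this commit the executor, the runner, and the test share one lookup rule: a truthy global superbench.enable (string or list) wins; otherwise only benchmarks carrying an explicit truthy 'enable' key run, and a benchmark with no 'enable' key at all — like megatron-gpt, once this commit reverts the 'enable: false' workaround — is simply skipped rather than raising. A self-contained sketch of that rule (config shape trimmed from default.yaml):

    from omegaconf import OmegaConf, ListConfig

    def get_enabled(sb_config):
        # Mirrors __get_enabled_benchmarks as of this patch.
        if 'enable' in sb_config.superbench and sb_config.superbench.enable:
            if isinstance(sb_config.superbench.enable, str):
                return [sb_config.superbench.enable]
            elif isinstance(sb_config.superbench.enable, (list, ListConfig)):
                return list(sb_config.superbench.enable)
        return [k for k, v in sb_config.superbench.benchmarks.items() if 'enable' in v and v.enable]

    cfg = OmegaConf.create({
        'superbench': {
            'benchmarks': {
                'kernel-launch': {'enable': True},
                'megatron-gpt': {},    # no 'enable' key: skipped, not an error
            }
        }
    })
    assert get_enabled(cfg) == ['kernel-launch']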
""" - if self._sb_config.superbench.enable: + if 'enable' in self._sb_config.superbench and self._sb_config.superbench.enable: if isinstance(self._sb_config.superbench.enable, str): return [self._sb_config.superbench.enable] elif isinstance(self._sb_config.superbench.enable, (list, ListConfig)): return list(self._sb_config.superbench.enable) - return [k for k, v in self._sb_benchmarks.items() if v.enable] + return [k for k, v in self._sb_benchmarks.items() if 'enable' in v and v.enable] def __get_mode_command(self, benchmark_name, mode, timeout=None): """Get runner command for given mode. diff --git a/tests/executor/test_executor.py b/tests/executor/test_executor.py index a9365e6c1..984f0437e 100644 --- a/tests/executor/test_executor.py +++ b/tests/executor/test_executor.py @@ -44,7 +44,7 @@ def test_set_logger(self): def test_get_enabled_benchmarks_enable_none(self): """Test enabled benchmarks when superbench.enable is none.""" benchmarks = self.default_config.superbench.benchmarks - expected_enabled_benchmarks = [x for x in benchmarks if benchmarks[x]['enable']] + expected_enabled_benchmarks = [x for x in benchmarks if 'enable' in benchmarks[x] and benchmarks[x]['enable']] self.assertListEqual(self.executor._sb_enabled, expected_enabled_benchmarks) def test_get_enabled_benchmarks_enable_str(self): From fd5eebf9f626429bfd04423dad32cc61fa5c3543 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Fri, 12 Jul 2024 12:31:36 +0000 Subject: [PATCH 4/8] wip --- superbench/executor/executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py index 926e68170..3f61385cc 100644 --- a/superbench/executor/executor.py +++ b/superbench/executor/executor.py @@ -240,7 +240,7 @@ def exec(self): model, platform=self.__get_platform(), framework=Framework(framework.lower()), - parameters=self.__get_arguments(benchmark_config.parameters) + parameters=self.__get_arguments({} if 'parameters' not in benchmark_config else benchmark_config.parameters) ) result = self.__exec_benchmark(full_name, context) benchmark_results.append(result) @@ -251,7 +251,7 @@ def exec(self): benchmark_real_name, platform=self.__get_platform(), framework=Framework(framework.lower()), - parameters=self.__get_arguments(benchmark_config.parameters) + parameters=self.__get_arguments({} if 'parameters' not in benchmark_config else benchmark_config.parameters) ) result = self.__exec_benchmark(full_name, context) benchmark_results.append(result) From a0ff2a6ff60fb094d189e5007480012b236475e4 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 15 Jul 2024 23:35:50 +0000 Subject: [PATCH 5/8] wip --- superbench/executor/executor.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py index 3f61385cc..c4a812a9c 100644 --- a/superbench/executor/executor.py +++ b/superbench/executor/executor.py @@ -240,7 +240,9 @@ def exec(self): model, platform=self.__get_platform(), framework=Framework(framework.lower()), - parameters=self.__get_arguments({} if 'parameters' not in benchmark_config else benchmark_config.parameters) + parameters=self.__get_arguments( + {} if 'parameters' not in benchmark_config else benchmark_config.parameters + ) ) result = self.__exec_benchmark(full_name, context) benchmark_results.append(result) @@ -251,7 +253,9 @@ def exec(self): benchmark_real_name, platform=self.__get_platform(), framework=Framework(framework.lower()), - parameters=self.__get_arguments({} if 
From 32c191894d1ddda211c3ef354780225f2575ce9b Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Tue, 16 Jul 2024 01:25:52 +0000
Subject: [PATCH 6/8] limit pytest version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index efa10ef12..4cd448939 100644
--- a/setup.py
+++ b/setup.py
@@ -198,7 +198,7 @@ def run(self):
         'pydocstyle>=5.1.1',
         'pytest-cov>=2.11.1',
         'pytest-subtests>=0.4.0',
-        'pytest>=6.2.2',
+        'pytest>=6.2.2, <=7.4.4',
         'types-markdown',
         'types-pkg_resources',
         'types-pyyaml',

From 72d56f224200b66e9c2fb995852bff1f58116f2f Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Tue, 16 Jul 2024 03:18:41 +0000
Subject: [PATCH 7/8] update runner

---
 superbench/runner/runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index cd0a5d262..9b5bcdb35 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -70,7 +70,7 @@ def __validate_sb_config(self):    # noqa: C901
         if 'env' not in self._sb_config.superbench:
             self._sb_config.superbench.env = {}
         for name in self._sb_benchmarks:
-            if 'modes' not in self._sb_benchmarks[name].modes:
+            if 'modes' not in self._sb_benchmarks[name]:
                 self._sb_benchmarks[name].modes = []
             for idx, mode in enumerate(self._sb_benchmarks[name].modes):
                 if 'env' not in mode:

From 86138e2da9726c8e76c53780b89df8a906800e51 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Tue, 16 Jul 2024 05:21:36 +0000
Subject: [PATCH 8/8] update

---
 superbench/runner/runner.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 9b5bcdb35..cd0c8c4dc 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -206,6 +206,9 @@ def run_sys_info(self):
         logger.info('Runner is going to get node system info.')

         fcmd = "docker exec sb-workspace bash -c '{command}'"
+
+        if 'skip' not in self._docker_config:
+            self._docker_config.skip = False
         if self._docker_config.skip:
             fcmd = "bash -c 'cd $SB_WORKSPACE && {command}'"
         ansible_runner_config = self._ansible_client.get_shell_config(
@@ -225,7 +228,7 @@ def check_env(self):    # pragma: no cover
             self._ansible_client.get_playbook_config(
                 'check_env.yaml',
                 extravars={
-                    'no_docker': False if 'skip' not in self._docker_config else bool(self._docker_config.skip),
+                    'no_docker': False if 'skip' not in self._docker_config else self._docker_config.skip,
                     'output_dir': str(self._output_path),
                     'env': '\n'.join(f'{k}={v}' for k, v in self._sb_config.superbench.env.items()),
                 }
@@ -450,6 +453,8 @@ def _run_proc(self, benchmark_name, mode, vars):
             timeout = max(timeout, 60)

         env_list = '--env-file /tmp/sb.env'
+        if 'skip' not in self._docker_config:
+            self._docker_config.skip = False
         if self._docker_config.skip:
             env_list = 'set -o allexport && source /tmp/sb.env && set +o allexport'
         for k, v in mode.env.items():
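[Editorial aside, not part of the patch series] The final commit repeats a three-line guard for docker_config.skip at two call sites and lets check_env pass the guarded value through directly. Since DictConfig supports dict-style setdefault — patch 1 already leans on it for mode.env — the guard could be collapsed to one line; a sketch of that alternative, not what the series ships:

    from omegaconf import OmegaConf

    docker_config = OmegaConf.create({})

    # The guard as written in patch 8:
    if 'skip' not in docker_config:
        docker_config.skip = False

    # A one-line equivalent using DictConfig.setdefault:
    docker_config.setdefault('skip', False)
    assert docker_config.skip is False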