diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py index 97045e5f..67d0f572 100755 --- a/lmms_eval/__main__.py +++ b/lmms_eval/__main__.py @@ -321,20 +321,22 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: for args in args_list: try: - if is_main_process and args.wandb_args: # thoughtfully we should only init wandb once, instead of multiple ranks to avoid network traffics and unwanted behaviors. - wandb_logger = WandbLogger(args) + # if is_main_process and args.wandb_args: # thoughtfully we should only init wandb once, instead of multiple ranks to avoid network traffics and unwanted behaviors. + # wandb_logger = WandbLogger() results, samples = cli_evaluate_single(args) results_list.append(results) accelerator.wait_for_everyone() if is_main_process and args.wandb_args: - wandb_logger.post_init(results) - wandb_logger.log_eval_result() - if args.wandb_log_samples and samples is not None: - wandb_logger.log_eval_samples(samples) - - wandb_logger.finish() + try: + wandb_logger.post_init(results) + wandb_logger.log_eval_result() + if args.wandb_log_samples and samples is not None: + wandb_logger.log_eval_samples(samples) + except Exception as e: + eval_logger.info(f"Logging to Weights and Biases failed due to {e}") + # wandb_logger.finish() except Exception as e: if args.verbosity == "DEBUG": @@ -349,6 +351,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: if results is not None: print_results(args, results) + if args.wandb_args: + wandb_logger.run.finish() + def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None: selected_task_list = args.tasks.split(",") if args.tasks else None