AWS check if bagged_score.csv exists #503

Open · wants to merge 8 commits into master
28 changes: 26 additions & 2 deletions ramp-engine/ramp_engine/aws/api.py
@@ -795,9 +795,14 @@ def _is_ready(config, instance_id):

 def _training_finished(config, instance_id, submission_name):
     """
-    Return True if a submission has finished training
+    Return True if a submission has finished training (the screen no longer
+    exists on the ec2 instance and a bagged_scores.csv or error file was saved)
     """
-    return not _has_screen(config, instance_id, submission_name)
+    has_screen = _has_screen(config, instance_id, submission_name)
+    # this can only work if the training was successful
+    has_score_file = _has_error_or_score_file(
+        config, instance_id, submission_name)
+    return not has_screen and has_score_file
Collaborator:

Doesn't this risk deadlocking the workers if the screen failed but the results were not saved? Which mechanism ensures that you will exit the training loop in this case?

Contributor Author:

You are right. Do you have a suggestion for an additional check to avoid that?

Collaborator:

You need three states: running (screen alive), finished (no screen and the score file exists), and broken (no screen and no score). Stop polling when you are not running anymore. See the sketch below.
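
A minimal sketch of what such a three-state check could look like (the `TrainingState` enum and `_training_state` helper are hypothetical illustrations, not part of this PR; `_has_screen` and `_has_error_or_score_file` are the helpers from the diff above):

from enum import Enum

class TrainingState(Enum):
    RUNNING = 'running'      # screen still alive
    FINISHED = 'finished'    # no screen, and a score or error file exists
    BROKEN = 'broken'        # no screen and no file: outcome unknown

def _training_state(config, instance_id, submission_name):
    # hypothetical helper combining the two checks from this PR
    if _has_screen(config, instance_id, submission_name):
        return TrainingState.RUNNING
    if _has_error_or_score_file(config, instance_id, submission_name):
        return TrainingState.FINISHED
    return TrainingState.BROKEN

The polling loop would then keep waiting only while the state is RUNNING.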

Contributor Author:

Yes, I understand the three states, and the finished state can end on either the score file or the error file. But the broken state is problematic, because I don't know how to check that we are not running anymore if we are broken; that's why checking whether the screen exists was introduced.
Do you have an idea how to do that?

Otherwise, I want to introduce a timeout for the AWS instances, which would be a partial solution: in those few cases where, for some unexplained reason, the instance has no screen and did not manage to save a file (i.e. is broken), in the worst-case scenario it would run for the set timeout length doing nothing.

Collaborator:

I would say that if you are in a broken state, just restart and consider it a checking error, no?
That way you launch the submission back, since you are in a state where you don't know what to do.
This is the classical approach taken in multiprocessing computations.
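
A rough sketch of that suggestion, built on the hypothetical `_training_state` helper above (the `requeue` callback and the string return values are assumptions, not taken from the codebase):

import logging

logger = logging.getLogger(__name__)

def _check_training(config, instance_id, submission_name, requeue):
    # map the three states onto worker actions; `requeue` is an assumed
    # callback that puts the submission back into the queue
    state = _training_state(config, instance_id, submission_name)
    if state is TrainingState.BROKEN:
        # outcome unknown: treat it as a checking error and relaunch
        logger.warning('screen died without leaving a score or error '
                       'file; adding the submission back to the queue')
        requeue(submission_name)
        return 'retry'
    return 'running' if state is TrainingState.RUNNING else 'finished'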

Contributor Author:

Yes, good idea. We can do that. But still, do you have an idea how to ensure that we are broken?
(No screen and no score previously meant still being in the process of saving the score, so it would not be a good idea to take only those two flags into account before restarting the whole submission.)

Collaborator:

Can't you compute the score in the same screen?
Otherwise, you can use a patience parameter to allow for some time, as sketched below.
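
A sketch of the patience idea (the function name, defaults, and polling loop are assumptions; `_has_error_or_score_file` is the helper added in this PR):

import time

def _wait_for_result_file(config, instance_id, submission_name,
                          patience=60, poll_interval=5):
    # once the screen has died, allow up to `patience` seconds for
    # bagged_scores.csv or an error file to appear before declaring
    # the run broken
    deadline = time.time() + patience
    while time.time() < deadline:
        if _has_error_or_score_file(config, instance_id, submission_name):
            return True   # finished, with either a score or a logged error
        time.sleep(poll_interval)
    return False          # broken: no file appeared within the patience window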



 def _training_successful(config, instance_id, submission_name,

@@ -844,6 +849,25 @@ def _has_screen(config, instance_id, screen_name):
     return nb > 0


+def _has_error_or_score_file(config, instance_id, screen_name):
+    """
+    Return True if a 'bagged_scores.csv' file exists (the submission
+    terminated with a success) or if an error.txt file exists in one of
+    the fold_* directories (the submission terminated with an error) on
+    the ec2 instance
+    """
+    submission_path = os.path.join(
+        config['remote_ramp_kit_folder'],
+        'submissions', screen_name, 'training_output')
+    cmd = ("bash -c '"
+           f"if [ -f {submission_path}/bagged_scores.csv ] || "
+           f"[ -f {submission_path}/fold_*/error.txt ]; "
+           "then echo 1; else echo 0; fi'")
+    is_log_or_score = _run(config, instance_id, cmd, return_output=True)
+
+    return int(is_log_or_score)


 def _tag_instance_by_submission(config, instance_id, submission_name):
     """
     Add tags to an instance with infos from the submission to know which
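
One caveat on the shell test in `_has_error_or_score_file`: `[ -f {submission_path}/fold_*/error.txt ]` only works if the glob expands to at most one path; if several folds write an error.txt, `[` receives multiple arguments, errors out, and the command falls through to `echo 0`. A possible alternative (assuming bash with the `compgen` builtin is available on the instance) succeeds whenever the glob matches at least one file:

cmd = ("bash -c '"
       f"if [ -f {submission_path}/bagged_scores.csv ] || "
       f"compgen -G \"{submission_path}/fold_*/error.txt\" > /dev/null; "
       "then echo 1; else echo 0; fi'")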
43 changes: 43 additions & 0 deletions ramp-engine/ramp_engine/tests/test_aws.py
@@ -333,6 +333,49 @@ class DummyInstance:
     assert 'Adding the submission back to the queue' in caplog.text
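
A note on the decorator stack in the new test below: `mock.patch` decorators are applied bottom-up, so the innermost patch (`_has_error_or_score_file`) is passed as the first test argument (`has_score_file`) and the outermost (`is_spot_terminated`) as the last (`spot_terminated`).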


+@mock.patch('ramp_engine.aws.api.is_spot_terminated')
+@mock.patch('ramp_engine.aws.api.launch_train')
+@mock.patch('ramp_engine.aws.api._has_screen')
+@mock.patch('ramp_engine.aws.api._has_error_or_score_file')
+def test_not_finished_until_bagged_or_log_saved(has_score_file, has_screen,
+                                                launch_train,
+                                                spot_terminated,
+                                                caplog):
+    """Check that the submission is considered finished only if it has
+    finished training and the bagged_scores.csv file was saved, or if it
+    failed with errors which appear in the log file."""
+    class DummyInstance:
+        id = 1
+    launch_train.return_value = 0
+    has_screen.return_value = True
+    has_score_file.return_value = False
+
+    # set up the AWS worker
+    event_config = read_config(ramp_aws_config_template())['worker']
+
+    worker = AWSWorker(event_config, submission='starting_kit_local')
+    worker.config = event_config
+    worker.submission = 'dummy submissions'
+    worker.instance = DummyInstance
+
+    # the submission has not yet finished training
+    spot_terminated.return_value = False
+
+    worker.launch_submission()
+    assert worker.status == 'running'
+    assert caplog.text == ''
+
+    # the screen is no longer there
+    has_screen.return_value = False
+    assert worker.status == 'running'
+    assert caplog.text == ''
+
+    # the score file was saved
+    has_score_file.return_value = True
+    assert worker.status == 'finished'
+    assert caplog.text == ''
+
+
 def test_aws_worker():
     if not os.path.isfile(os.path.join(HERE, 'config.yml')):
         pytest.skip("Only for local tests for now")