Improve testing for custom check functions
niknetniko committed Jun 12, 2024
1 parent e258ce2 commit 83ef6cf
Showing 10 changed files with 217 additions and 113 deletions.
12 changes: 5 additions & 7 deletions tested/internationalization/nl.yaml
@@ -16,13 +16,11 @@ nl:
runtime: "Runtime error"
unexpected: "Onverwachte uitvoer"
programmed:
student:
default: >-
Er ging iets fout op bij het evalueren van de oplossing.
Meld dit aan de lesgever!
result: "Het resultaat van de geprogrammeerde evaluatie is ongeldig:"
stdout: "Dit werd geproduceerd op stdout:"
stderr: "Dit werd geproduceerd op stderr:"
student: >-
Er ging iets fout bij het evalueren van de oplossing.
Meld dit aan de lesgever!
stdout: "Het evalueren van de oplossing genereerde deze uitvoer op stdout:"
stderr: "Het evalueren van de oplossing genereerde deze uitvoer op stderr:"
specific:
student:
default: >-
115 changes: 64 additions & 51 deletions tested/oracles/programmed.py
@@ -68,19 +68,24 @@ def _catch_output() -> Generator[tuple[StringIO, StringIO], None, None]:
sys.stderr = old_stderr


def _evaluate_programmed(
bundle: Bundle,
oracle: CustomCheckOracle,
context: OracleContext,
) -> BaseExecutionResult | BooleanEvalResult:
"""
Run the custom evaluation. Concerning structure and execution, the custom
oracle is very similar to the execution of the whole evaluation. It is a
mini-evaluation, if you will.
def _execute_custom_check_function(
bundle: Bundle, oracle: CustomCheckOracle, context: OracleContext
):
"""
_logger.debug("Doing evaluation in Python mode.")
Execute a custom check function, returning the captured stdout and stderr if
execution got that far.
This function can raise various errors, depending on where in the process it
fails: invalid syntax in the oracle code raises a SyntaxError, and any
exception raised by the custom check function itself propagates as well.
# Create a configs bundle for the language of the oracle.
:param bundle: The bundle of the original execution.
:param oracle: The oracle that is executing.
:param context: The context of said oracle.
:return: A tuple (result, stdout, stderr); each element may be None.
"""
# Create a config bundle for Python, the programming language of the oracle.
eval_bundle = create_bundle(bundle.config, bundle.out, bundle.suite, "python")

# Path to the oracle.
@@ -102,33 +107,53 @@ def _evaluate_programmed(
"__tested_context__": ConvertedOracleContext.from_context(eval_bundle, context),
}
exec("import sys\n" "sys.modules['evaluation_utils'] = __tested_test__", global_env)
# Make the oracle available.

# Make the oracle available. This will fail on syntax errors.
exec(evaluator_code, global_env)

# Since we pass a class value, we don't want to
# Create the function we will call.
check_function_call = FunctionCall(
type=FunctionType.FUNCTION,
name=oracle.function.name,
arguments=[Identifier("__tested_context__"), *oracle.arguments],
)
# The actual code for calling the function.
literal_function_call = generate_statement(eval_bundle, check_function_call)

# Call the function while intercepting all output.
with _catch_output() as (stdout_, stderr_):
exec(f"__tested_test__result = {literal_function_call}", global_env)
result_ = cast(BooleanEvalResult | None, global_env["__tested_test__result"])
stdout_ = stdout_.getvalue()
stderr_ = stderr_.getvalue()

return result_, stdout_, stderr_
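
The capture-and-exec pattern used by _execute_custom_check_function can be shown in isolation. The sketch below uses only the standard library; the function and variable names are illustrative and not part of TESTed.

import contextlib
import io


def run_check_capturing_output(check_source: str, call_expression: str):
    """Exec user-provided check code, then call it while capturing output."""
    global_env: dict = {}
    # Exec'ing the check source raises SyntaxError for invalid code.
    exec(check_source, global_env)

    stdout_buffer, stderr_buffer = io.StringIO(), io.StringIO()
    with contextlib.redirect_stdout(stdout_buffer), contextlib.redirect_stderr(
        stderr_buffer
    ):
        # Any exception raised by the check itself propagates to the caller.
        exec(f"__result = {call_expression}", global_env)

    return global_env["__result"], stdout_buffer.getvalue(), stderr_buffer.getvalue()


# Example: a check that prints and returns a boolean.
result, out, err = run_check_capturing_output(
    "def check(value):\n    print('checking', value)\n    return value == 42",
    "check(42)",
)
assert result is True and "checking 42" in out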


def _evaluate_programmed(
bundle: Bundle,
oracle: CustomCheckOracle,
context: OracleContext,
) -> BaseExecutionResult | BooleanEvalResult:
"""
Run the custom evaluation. The actual execution is delegated to a helper
function; this function mainly provides error handling and reporting.
"""

result_ = None
stdout_ = None
stderr_ = None
messages = []
# noinspection PyBroadException
try:
with _catch_output() as (stdout_, stderr_):
exec(f"__tested_test__result = {literal_function_call}", global_env)
result_ = cast(BooleanEvalResult | None, global_env["__tested_test__result"])
stdout_ = stdout_.getvalue()
stderr_ = stderr_.getvalue()
except Exception as e:
result_, stdout_, stderr_ = _execute_custom_check_function(
bundle, oracle, context
)
except SyntaxError as e:
# The oracle might be rubbish, so handle any exception.
_logger.exception(e)
result_ = None
stdout_ = None
stderr_ = None
messages.append(
ExtendedMessage(
description="The custom check oracle failed with the following exception:",
description="The custom check oracle failed with the following syntax error:",
format="text",
permission=Permission.STAFF,
)
@@ -137,43 +162,31 @@ def _evaluate_programmed(
messages.append(
ExtendedMessage(description=tb, format="code", permission=Permission.STAFF)
)

if stdout_:
messages.append(
ExtendedMessage(
description=get_i18n_string("judge.programmed.produced.stdout"),
format="text",
)
)
messages.append(ExtendedMessage(description=stdout_, format="code"))
if stderr_:
except Exception as e:
_logger.exception(e)
messages.append(
ExtendedMessage(
description=get_i18n_string("judge.programmed.produced.stderr"),
description="The custom check oracle failed with the following exception:",
format="text",
permission=Permission.STUDENT,
permission=Permission.STAFF,
)
)
tb = traceback.format_exc()
messages.append(
ExtendedMessage(
description=stderr_, format="code", permission=Permission.STAFF
)
ExtendedMessage(description=tb, format="code", permission=Permission.STAFF)
)

if stdout_:
messages.append(get_i18n_string("judge.programmed.produced.stdout"))
messages.append(ExtendedMessage(description=stdout_, format="code"))
if stderr_:
messages.append(get_i18n_string("judge.programmed.produced.stderr"))
messages.append(ExtendedMessage(description=stderr_, format="code"))

# If the result is None, the oracle is broken.
if result_ is None:
messages.append(
ExtendedMessage(
description=get_i18n_string("judge.programmed.student"), format="text"
)
)
messages.append(
ExtendedMessage(
description=get_i18n_string("judge.programmed.failed"),
format="text",
permission=Permission.STAFF,
)
)
messages.append(get_i18n_string("judge.programmed.student"))
messages.append("The custom check oracle did not produce a valid return value.")
return BooleanEvalResult(
result=Status.INTERNAL_ERROR,
readable_expected=None,
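
The split error handling introduced in _evaluate_programmed — a SyntaxError while loading the oracle code versus an arbitrary exception while running the check — can be sketched independently. The names below are hypothetical and only illustrate the control flow.

import traceback


def evaluate_with_error_handling(check_source: str, call_expression: str):
    """Return (result, messages); result is None when the oracle is broken."""
    messages = []
    env: dict = {}
    try:
        exec(check_source, env)                     # may raise SyntaxError
        exec(f"__result = {call_expression}", env)  # may raise anything
        return env["__result"], messages
    except SyntaxError:
        messages.append("The custom check oracle failed with the following syntax error:")
    except Exception:
        messages.append("The custom check oracle failed with the following exception:")
    messages.append(traceback.format_exc())
    return None, messages
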
4 changes: 4 additions & 0 deletions tests/exercises/echo-function/evaluation/evaluator.py
@@ -18,3 +18,7 @@ def evaluate_value_dsl(context):
dsl_expected="{5, 5}",
dsl_actual="{4, 4}"
)


def evaluate_runtime_crash(context):
return len(context) / 0
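
For contrast with the deliberately crashing check above, a passing check function would look roughly like the sketch below. The EvaluationResult arguments and context attributes used here are assumptions inferred from the surrounding diff, not a confirmed API.

# noinspection PyUnresolvedReferences
from evaluation_utils import EvaluationResult, Message


def evaluate_correct(context):
    # Assumed attributes: context.expected and context.actual hold the expected
    # and produced values; EvaluationResult is assumed to accept a boolean
    # result and an optional list of messages.
    return EvaluationResult(
        result=context.expected == context.actual,
        messages=[Message("Compared the produced value against the expected one.")],
    )
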
@@ -0,0 +1,6 @@
# noinspection PyUnresolvedReferences
from evaluation_utils import EvaluationResult, Message


evaluate(context):
return len(context) / 0
@@ -0,0 +1,9 @@
- tab: "My tab"
contexts:
- testcases:
- expression: 'echo("input-1")'
return: !oracle
oracle: "custom_check"
file: "evaluator.py"
name: "evaluate_runtime_crash"
value: "input-2"
@@ -0,0 +1,9 @@
- tab: "My tab"
contexts:
- testcases:
- expression: 'echo("input-1")'
return: !oracle
oracle: "custom_check"
file: "evaluator_syntax_error.py"
name: "this_does_not_exist"
value: "input-2"
@@ -0,0 +1,9 @@
- tab: "My tab"
contexts:
- testcases:
- expression: 'echo("input-1")'
return: !oracle
oracle: "custom_check"
file: "evaluator_syntax_error.py"
name: "evaluate"
value: "input-2"
15 changes: 15 additions & 0 deletions tests/language_markers.py
@@ -0,0 +1,15 @@
import pytest

COMPILE_LANGUAGES = [
"python",
"java",
"c",
"kotlin",
pytest.param("haskell", marks=pytest.mark.haskell),
"csharp",
]
ALL_SPECIFIC_LANGUAGES = COMPILE_LANGUAGES + [
"javascript",
pytest.param("runhaskell", marks=pytest.mark.haskell),
]
ALL_LANGUAGES = ALL_SPECIFIC_LANGUAGES + ["bash"]
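
Test modules can now import these shared lists instead of redefining them, mirroring the change to tests/test_functionality.py below; a minimal usage sketch:

import pytest
from language_markers import ALL_LANGUAGES


@pytest.mark.parametrize("language", ALL_LANGUAGES)
def test_runs_for_every_language(language: str):
    # Placeholder body: each supported language gets its own test case.
    assert language
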
56 changes: 1 addition & 55 deletions tests/test_functionality.py
@@ -12,6 +12,7 @@
from pathlib import Path

import pytest
from language_markers import ALL_LANGUAGES, ALL_SPECIFIC_LANGUAGES

from tested.configs import create_bundle
from tested.datatypes import BasicBooleanTypes, BasicNumericTypes, BasicStringTypes
@@ -29,20 +30,6 @@
from tested.testsuite import Context, MainInput, Suite, Tab, Testcase, TextData
from tests.manual_utils import assert_valid_output, configuration, execute_config

COMPILE_LANGUAGES = [
"python",
"java",
"c",
"kotlin",
pytest.param("haskell", marks=pytest.mark.haskell),
"csharp",
]
ALL_SPECIFIC_LANGUAGES = COMPILE_LANGUAGES + [
"javascript",
pytest.param("runhaskell", marks=pytest.mark.haskell),
]
ALL_LANGUAGES = ALL_SPECIFIC_LANGUAGES + ["bash"]

quotes = {
"python": "'",
"java": '"',
@@ -94,31 +81,6 @@ def test_io_exercise_wrong(language: str, tmp_path: Path, pytestconfig):
assert updates.find_status_enum() == ["wrong"]


@pytest.mark.parametrize("language", ALL_LANGUAGES)
def test_simple_programmed_eval(language: str, tmp_path: Path, pytestconfig):
conf = configuration(
pytestconfig,
"echo",
language,
tmp_path,
"one-programmed-correct.tson",
"correct",
)
result = execute_config(conf)
updates = assert_valid_output(result, pytestconfig)
assert updates.find_status_enum() == ["correct"]


@pytest.mark.parametrize("language", ALL_LANGUAGES)
def test_simple_programmed_eval_wrong(language: str, tmp_path: Path, pytestconfig):
conf = configuration(
pytestconfig, "echo", language, tmp_path, "one-programmed-wrong.tson", "correct"
)
result = execute_config(conf)
updates = assert_valid_output(result, pytestconfig)
assert updates.find_status_enum() == ["wrong"]


@pytest.mark.parametrize("language", ALL_LANGUAGES)
def test_io_function_exercise(language: str, tmp_path: Path, pytestconfig):
conf = configuration(
@@ -278,22 +240,6 @@ def test_specific_evaluation(language: str, tmp_path: Path, pytestconfig):
assert len(updates.find_all("append-message")) == 2


@pytest.mark.parametrize("language", ALL_LANGUAGES)
def test_programmed_evaluation(language: str, tmp_path: Path, pytestconfig):
conf = configuration(
pytestconfig,
"echo-function",
language,
tmp_path,
"programmed.tson",
"correct",
)
result = execute_config(conf)
updates = assert_valid_output(result, pytestconfig)
assert updates.find_status_enum() == ["correct"]
assert len(updates.find_all("append-message"))


@pytest.mark.parametrize(
"lang",
[
