Improve testing for custom check functions
niknetniko committed Jun 12, 2024
1 parent e258ce2 commit 83ef6cf
Showing 10 changed files with 217 additions and 113 deletions.
12 changes: 5 additions & 7 deletions tested/internationalization/nl.yaml
@@ -16,13 +16,11 @@ nl:
runtime: "Runtime error"
unexpected: "Onverwachte uitvoer"
programmed:
student:
default: >-
Er ging iets fout op bij het evalueren van de oplossing.
Meld dit aan de lesgever!
result: "Het resultaat van de geprogrammeerde evaluatie is ongeldig:"
stdout: "Dit werd geproduceerd op stdout:"
stderr: "Dit werd geproduceerd op stderr:"
student: >-
Er ging iets fout bij het evalueren van de oplossing.
Meld dit aan de lesgever!
stdout: "Het evalueren van de oplossing genereerde deze uitvoer op stdout:"
stderr: "Het evalueren van de oplossing genereerde deze uitvoer op stderr:"
specific:
student:
default: >-
115 changes: 64 additions & 51 deletions tested/oracles/programmed.py
@@ -68,19 +68,24 @@ def _catch_output() -> Generator[tuple[StringIO, StringIO], None, None]:
sys.stderr = old_stderr


def _evaluate_programmed(
bundle: Bundle,
oracle: CustomCheckOracle,
context: OracleContext,
) -> BaseExecutionResult | BooleanEvalResult:
"""
Run the custom evaluation. Concerning structure and execution, the custom
oracle is very similar to the execution of the whole evaluation. It is a
mini-evaluation, if you will.
def _execute_custom_check_function(
bundle: Bundle, oracle: CustomCheckOracle, context: OracleContext
):
"""
_logger.debug("Doing evaluation in Python mode.")
Execute a custom check function, returning the captured stdout and stderr if
execution got that far.
This function can raise various errors, depending on where in the process it
fails: invalid syntax in the oracle code raises a SyntaxError, and any
exception raised by the custom check function itself propagates as well.
# Create a configs bundle for the language of the oracle.
:param bundle: The bundle of the original execution.
:param oracle: The oracle that is executing.
:param context: The context of said oracle.
:return: A tuple (result, stdout, stderr); each element may be None.
"""
# Create a config bundle for Python, the programming language of the oracle.
eval_bundle = create_bundle(bundle.config, bundle.out, bundle.suite, "python")

# Path to the oracle.
@@ -102,33 +107,53 @@ def _evaluate_programmed(
"__tested_context__": ConvertedOracleContext.from_context(eval_bundle, context),
}
exec("import sys\n" "sys.modules['evaluation_utils'] = __tested_test__", global_env)
# Make the oracle available.

# Make the oracle available. This will fail on syntax errors.
exec(evaluator_code, global_env)

# Since we pass a class value, we don't want to
# Create the function we will call.
check_function_call = FunctionCall(
type=FunctionType.FUNCTION,
name=oracle.function.name,
arguments=[Identifier("__tested_context__"), *oracle.arguments],
)
# The actual code for calling the function.
literal_function_call = generate_statement(eval_bundle, check_function_call)

# Call the function while intercepting all output.
with _catch_output() as (stdout_, stderr_):
exec(f"__tested_test__result = {literal_function_call}", global_env)
result_ = cast(BooleanEvalResult | None, global_env["__tested_test__result"])
stdout_ = stdout_.getvalue()
stderr_ = stderr_.getvalue()

return result_, stdout_, stderr_
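
The capture-and-exec pattern used by _execute_custom_check_function can be shown in isolation. The sketch below uses only the standard library; the function and variable names are illustrative and not part of TESTed.

import contextlib
import io


def run_check_capturing_output(check_source: str, call_expression: str):
    """Exec user-provided check code, then call it while capturing output."""
    global_env: dict = {}
    # Exec'ing the check source raises SyntaxError for invalid code.
    exec(check_source, global_env)

    stdout_buffer, stderr_buffer = io.StringIO(), io.StringIO()
    with contextlib.redirect_stdout(stdout_buffer), contextlib.redirect_stderr(
        stderr_buffer
    ):
        # Any exception raised by the check itself propagates to the caller.
        exec(f"__result = {call_expression}", global_env)

    return global_env["__result"], stdout_buffer.getvalue(), stderr_buffer.getvalue()


# Example: a check that prints and returns a boolean.
result, out, err = run_check_capturing_output(
    "def check(value):\n    print('checking', value)\n    return value == 42",
    "check(42)",
)
assert result is True and "checking 42" in out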


def _evaluate_programmed(
bundle: Bundle,
oracle: CustomCheckOracle,
context: OracleContext,
) -> BaseExecutionResult | BooleanEvalResult:
"""
Run the custom evaluation. The actual execution is delegated to a helper
function; this function mainly provides error handling and reporting.
"""

result_ = None
stdout_ = None
stderr_ = None
messages = []
# noinspection PyBroadException
try:
with _catch_output() as (stdout_, stderr_):
exec(f"__tested_test__result = {literal_function_call}", global_env)
result_ = cast(BooleanEvalResult | None, global_env["__tested_test__result"])
stdout_ = stdout_.getvalue()
stderr_ = stderr_.getvalue()
except Exception as e:
result_, stdout_, stderr_ = _execute_custom_check_function(
bundle, oracle, context
)
except SyntaxError as e:
# The oracle might be rubbish, so handle any exception.
_logger.exception(e)
result_ = None
stdout_ = None
stderr_ = None
messages.append(
ExtendedMessage(
description="The custom check oracle failed with the following exception:",
description="The custom check oracle failed with the following syntax error:",
format="text",
permission=Permission.STAFF,
)
@@ -137,43 +162,31 @@ def _evaluate_programmed(
messages.append(
ExtendedMessage(description=tb, format="code", permission=Permission.STAFF)
)

if stdout_:
messages.append(
ExtendedMessage(
description=get_i18n_string("judge.programmed.produced.stdout"),
format="text",
)
)
messages.append(ExtendedMessage(description=stdout_, format="code"))
if stderr_:
except Exception as e:
_logger.exception(e)
messages.append(
ExtendedMessage(
description=get_i18n_string("judge.programmed.produced.stderr"),
description="The custom check oracle failed with the following exception:",
format="text",
permission=Permission.STUDENT,
permission=Permission.STAFF,
)
)
tb = traceback.format_exc()
messages.append(
ExtendedMessage(
description=stderr_, format="code", permission=Permission.STAFF
)
ExtendedMessage(description=tb, format="code", permission=Permission.STAFF)
)

if stdout_:
messages.append(get_i18n_string("judge.programmed.produced.stdout"))
messages.append(ExtendedMessage(description=stdout_, format="code"))
if stderr_:
messages.append(get_i18n_string("judge.programmed.produced.stderr"))
messages.append(ExtendedMessage(description=stderr_, format="code"))

# If the result is None, the oracle is broken.
if result_ is None:
messages.append(
ExtendedMessage(
description=get_i18n_string("judge.programmed.student"), format="text"
)
)
messages.append(
ExtendedMessage(
description=get_i18n_string("judge.programmed.failed"),
format="text",
permission=Permission.STAFF,
)
)
messages.append(get_i18n_string("judge.programmed.student"))
messages.append("The custom check oracle did not produce a valid return value.")
return BooleanEvalResult(
result=Status.INTERNAL_ERROR,
readable_expected=None,
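
The split error handling introduced in _evaluate_programmed — a SyntaxError while loading the oracle code versus an arbitrary exception while running the check — can be sketched independently. The names below are hypothetical and only illustrate the control flow.

import traceback


def evaluate_with_error_handling(check_source: str, call_expression: str):
    """Return (result, messages); result is None when the oracle is broken."""
    messages = []
    env: dict = {}
    try:
        exec(check_source, env)                     # may raise SyntaxError
        exec(f"__result = {call_expression}", env)  # may raise anything
        return env["__result"], messages
    except SyntaxError:
        messages.append("The custom check oracle failed with the following syntax error:")
    except Exception:
        messages.append("The custom check oracle failed with the following exception:")
    messages.append(traceback.format_exc())
    return None, messages
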
4 changes: 4 additions & 0 deletions tests/exercises/echo-function/evaluation/evaluator.py
@@ -18,3 +18,7 @@ def evaluate_value_dsl(context):
dsl_expected="{5, 5}",
dsl_actual="{4, 4}"
)


def evaluate_runtime_crash(context):
return len(context) / 0
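
For contrast with the deliberately crashing check above, a passing check function would look roughly like the sketch below. The EvaluationResult arguments and context attributes used here are assumptions inferred from the surrounding diff, not a confirmed API.

# noinspection PyUnresolvedReferences
from evaluation_utils import EvaluationResult, Message


def evaluate_correct(context):
    # Assumed attributes: context.expected and context.actual hold the expected
    # and produced values; EvaluationResult is assumed to accept a boolean
    # result and an optional list of messages.
    return EvaluationResult(
        result=context.expected == context.actual,
        messages=[Message("Compared the produced value against the expected one.")],
    )
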
@@ -0,0 +1,6 @@
# noinspection PyUnresolvedReferences
from evaluation_utils import EvaluationResult, Message


evaluate(context):
return len(context) / 0
@@ -0,0 +1,9 @@
- tab: "My tab"
contexts:
- testcases:
- expression: 'echo("input-1")'
return: !oracle
oracle: "custom_check"
file: "evaluator.py"
name: "evaluate_runtime_crash"
value: "input-2"
@@ -0,0 +1,9 @@
- tab: "My tab"
contexts:
- testcases:
- expression: 'echo("input-1")'
return: !oracle
oracle: "custom_check"
file: "evaluator_syntax_error.py"
name: "this_does_not_exist"
value: "input-2"
@@ -0,0 +1,9 @@
- tab: "My tab"
contexts:
- testcases:
- expression: 'echo("input-1")'
return: !oracle
oracle: "custom_check"
file: "evaluator_syntax_error.py"
name: "evaluate"
value: "input-2"
15 changes: 15 additions & 0 deletions tests/language_markers.py
@@ -0,0 +1,15 @@
import pytest

COMPILE_LANGUAGES = [
"python",
"java",
"c",
"kotlin",
pytest.param("haskell", marks=pytest.mark.haskell),
"csharp",
]
ALL_SPECIFIC_LANGUAGES = COMPILE_LANGUAGES + [
"javascript",
pytest.param("runhaskell", marks=pytest.mark.haskell),
]
ALL_LANGUAGES = ALL_SPECIFIC_LANGUAGES + ["bash"]
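
Test modules can now import these shared lists instead of redefining them, mirroring the change to tests/test_functionality.py below; a minimal usage sketch:

import pytest
from language_markers import ALL_LANGUAGES


@pytest.mark.parametrize("language", ALL_LANGUAGES)
def test_runs_for_every_language(language: str):
    # Placeholder body: each supported language gets its own test case.
    assert language
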
56 changes: 1 addition & 55 deletions tests/test_functionality.py
@@ -12,6 +12,7 @@
from pathlib import Path

import pytest
from language_markers import ALL_LANGUAGES, ALL_SPECIFIC_LANGUAGES

from tested.configs import create_bundle
from tested.datatypes import BasicBooleanTypes, BasicNumericTypes, BasicStringTypes
@@ -29,20 +30,6 @@
from tested.testsuite import Context, MainInput, Suite, Tab, Testcase, TextData
from tests.manual_utils import assert_valid_output, configuration, execute_config

COMPILE_LANGUAGES = [
"python",
"java",
"c",
"kotlin",
pytest.param("haskell", marks=pytest.mark.haskell),
"csharp",
]
ALL_SPECIFIC_LANGUAGES = COMPILE_LANGUAGES + [
"javascript",
pytest.param("runhaskell", marks=pytest.mark.haskell),
]
ALL_LANGUAGES = ALL_SPECIFIC_LANGUAGES + ["bash"]

quotes = {
"python": "'",
"java": '"',
@@ -94,31 +81,6 @@ def test_io_exercise_wrong(language: str, tmp_path: Path, pytestconfig):
assert updates.find_status_enum() == ["wrong"]


@pytest.mark.parametrize("language", ALL_LANGUAGES)
def test_simple_programmed_eval(language: str, tmp_path: Path, pytestconfig):
conf = configuration(
pytestconfig,
"echo",
language,
tmp_path,
"one-programmed-correct.tson",
"correct",
)
result = execute_config(conf)
updates = assert_valid_output(result, pytestconfig)
assert updates.find_status_enum() == ["correct"]


@pytest.mark.parametrize("language", ALL_LANGUAGES)
def test_simple_programmed_eval_wrong(language: str, tmp_path: Path, pytestconfig):
conf = configuration(
pytestconfig, "echo", language, tmp_path, "one-programmed-wrong.tson", "correct"
)
result = execute_config(conf)
updates = assert_valid_output(result, pytestconfig)
assert updates.find_status_enum() == ["wrong"]


@pytest.mark.parametrize("language", ALL_LANGUAGES)
def test_io_function_exercise(language: str, tmp_path: Path, pytestconfig):
conf = configuration(
@@ -278,22 +240,6 @@ def test_specific_evaluation(language: str, tmp_path: Path, pytestconfig):
assert len(updates.find_all("append-message")) == 2


@pytest.mark.parametrize("language", ALL_LANGUAGES)
def test_programmed_evaluation(language: str, tmp_path: Path, pytestconfig):
conf = configuration(
pytestconfig,
"echo-function",
language,
tmp_path,
"programmed.tson",
"correct",
)
result = execute_config(conf)
updates = assert_valid_output(result, pytestconfig)
assert updates.find_status_enum() == ["correct"]
assert len(updates.find_all("append-message"))


@pytest.mark.parametrize(
"lang",
[
