Created MultipleChoiceQuestion and MultipleChoiceEvaluation #157
Conversation
Force-pushed from 85a5dad to 3b4fc47
⚡️ Codeflash found optimizations for this PR 📄

| Test | Status | Details |
|---|---|---|
| ⚙️ Existing Unit Tests | ✅ 5 Passed | See below |
| 🌀 Generated Regression Tests | ✅ 24 Passed | See below |
| ⏪ Replay Tests | 🔘 None Found | |
| 🔎 Concolic Coverage Tests | 🔘 None Found | |
| 📊 Coverage | undefined | |
⚙️ Existing Unit Tests Details
- test_utils.py
🌀 Generated Regression Tests Details
```python
# imports
import pytest  # used for our unit tests

from aviary.utils import MultipleChoiceEvaluation


# unit tests
def test_single_correct():
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT]
    )


def test_single_incorrect():
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.INCORRECT]
    )


def test_mixed_evaluations():
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT, MultipleChoiceEvaluation.INCORRECT, MultipleChoiceEvaluation.CORRECT]
    )


def test_mixed_with_unsure():
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT, MultipleChoiceEvaluation.UNSURE, MultipleChoiceEvaluation.INCORRECT]
    )


def test_empty_input():
    with pytest.raises(ZeroDivisionError):
        MultipleChoiceEvaluation.calculate_accuracy_precision([])


def test_large_number_of_evaluations():
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT] * 1000 + [MultipleChoiceEvaluation.INCORRECT] * 1000
    )


def test_large_number_with_unsure():
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT] * 1000
        + [MultipleChoiceEvaluation.INCORRECT] * 1000
        + [MultipleChoiceEvaluation.UNSURE] * 1000
    )


def test_typical_classroom_scenario():
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT] * 20
        + [MultipleChoiceEvaluation.INCORRECT] * 5
        + [MultipleChoiceEvaluation.UNSURE] * 5
    )


def test_all_correct():
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT] * 10
    )


def test_all_incorrect():
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.INCORRECT] * 10
    )


def test_non_string_non_enum_inputs():
    with pytest.raises(ValueError):
        MultipleChoiceEvaluation.calculate_accuracy_precision([None, 123, 45.6])


def test_minimum_valid_input():
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT]
    )


def test_maximum_valid_input():
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT] * 10**6
    )


# codeflash_output is used to check that the output of the original code is
# the same as that of the optimized code.
```
```python
# imports
import random

import pytest  # used for our unit tests

from aviary.utils import MultipleChoiceEvaluation


# unit tests
def test_single_correct():
    """Test with a single correct evaluation"""
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision([MultipleChoiceEvaluation.CORRECT])


def test_single_incorrect():
    """Test with a single incorrect evaluation"""
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision([MultipleChoiceEvaluation.INCORRECT])


def test_mixed_correct_incorrect():
    """Test with a mix of correct and incorrect evaluations"""
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT, MultipleChoiceEvaluation.INCORRECT]
    )


def test_mixed_correct_incorrect_unsure():
    """Test with a mix of correct, incorrect, and unsure evaluations"""
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT, MultipleChoiceEvaluation.INCORRECT, MultipleChoiceEvaluation.UNSURE]
    )


def test_empty_input():
    """Test with an empty input sequence"""
    with pytest.raises(ZeroDivisionError):
        MultipleChoiceEvaluation.calculate_accuracy_precision([])


def test_invalid_string_input():
    """Test with an invalid string input"""
    with pytest.raises(ValueError):
        MultipleChoiceEvaluation.calculate_accuracy_precision(["INVALID"])


def test_mixed_valid_invalid_inputs():
    """Test with mixed valid and invalid inputs"""
    with pytest.raises(ValueError):
        MultipleChoiceEvaluation.calculate_accuracy_precision([MultipleChoiceEvaluation.CORRECT, "INVALID"])


def test_large_number_of_evaluations():
    """Test with a large number of evaluations"""
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT] * 1000 + [MultipleChoiceEvaluation.INCORRECT] * 1000
    )


def test_large_number_of_mixed_evaluations():
    """Test with a large number of mixed evaluations"""
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT] * 1000
        + [MultipleChoiceEvaluation.INCORRECT] * 500
        + [MultipleChoiceEvaluation.UNSURE] * 500
    )


def test_very_large_input():
    """Test with a very large input"""
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(
        [MultipleChoiceEvaluation.CORRECT] * 10**6 + [MultipleChoiceEvaluation.INCORRECT] * 10**6
    )


def test_stress_test_with_random_evaluations():
    """Stress test with random evaluations"""
    evaluations = random.choices(
        [MultipleChoiceEvaluation.CORRECT, MultipleChoiceEvaluation.INCORRECT, MultipleChoiceEvaluation.UNSURE],
        k=1000,
    )
    codeflash_output = MultipleChoiceEvaluation.calculate_accuracy_precision(evaluations)


# codeflash_output is used to check that the output of the original code is
# the same as that of the optimized code.
```
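For orientation, here is a minimal sketch of what `calculate_accuracy_precision` plausibly does, reconstructed only from the behavior these generated tests pin down (enum coercion raising `ValueError`, `ZeroDivisionError` on empty input, and `UNSURE` lowering accuracy but not precision). The enum values and exact implementation are assumptions; the PR's actual code may differ.

```python
from collections.abc import Sequence
from enum import StrEnum
from typing import Self


class MultipleChoiceEvaluation(StrEnum):
    # The member values here are assumptions.
    CORRECT = "correct"
    INCORRECT = "incorrect"
    UNSURE = "unsure"

    @classmethod
    def calculate_accuracy_precision(
        cls, evaluations: Sequence[Self | str]
    ) -> tuple[float, float]:
        # Coercing each value through the enum makes invalid entries
        # (e.g. "INVALID", None, 123) raise ValueError, as the tests expect.
        members = [cls(e) for e in evaluations]
        num_correct = sum(e == cls.CORRECT for e in members)
        # Accuracy counts UNSURE against the total; an empty sequence
        # raises ZeroDivisionError, matching test_empty_input.
        accuracy = num_correct / len(members)
        # Precision considers only the "sure" answers (CORRECT or INCORRECT).
        num_sure = sum(e in (cls.CORRECT, cls.INCORRECT) for e in members)
        precision = num_correct / num_sure
        return accuracy, precision
```

Under this reading, the classroom scenario above (20 correct, 5 incorrect, 5 unsure) gives accuracy 20/30 ≈ 0.67 and precision 20/25 = 0.80.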
```diff
@@ -88,6 +108,23 @@ def is_coroutine_callable(obj) -> bool:
     return False


+async def run_prompt(
```
worth using the llm-client code here?
Yes 100% agreed, but ima leave that for another PR.

I don't think @maykcaldas has pulled `llm-client` into this repo's `llm` extra yet. Though one comment: `llm-client` is a bit heavier than just `litellm`, so maybe we should allow for both routes.
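A minimal sketch of that "both routes" idea, using an optional-dependency fallback: `litellm.acompletion` is a real call, while the `llm-client` import name and interface below are placeholders, not its actual API.

```python
# Prefer llm-client when installed, else fall back to plain litellm.
try:
    import llmclient  # hypothetical import name for llm-client

    HAS_LLM_CLIENT = True
except ImportError:
    HAS_LLM_CLIENT = False

import litellm


async def run_prompt(prompt: str, model: str = "gpt-4o") -> str:
    # "gpt-4o" is an arbitrary default for illustration.
    messages = [{"role": "user", "content": prompt}]
    if HAS_LLM_CLIENT:
        # Hypothetical llm-client route; its real API may differ.
        return await llmclient.complete(model=model, messages=messages)
    # Lighter-weight litellm route.
    response = await litellm.acompletion(model=model, messages=messages)
    return response.choices[0].message.content
```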
lgtm -- couple minor comments
This PR generalizes https://github.com/Future-House/paper-qa/blob/a630d922d76551e9cd0258c62e2012cd5d459937/paperqa/litqa.py by creating a common multiple-choice class that other environments besides `paper-qa` can use. It also aims to:
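As a rough illustration of that generalization, here is a hedged sketch of how a non-`paper-qa` environment might consume the new classes. The `MultipleChoiceQuestion` field names are assumptions inferred from the PR title, not confirmed API; only `calculate_accuracy_precision` is exercised by the tests above.

```python
# Hedged sketch; field names (question, options, ideal_answer) are assumptions.
from aviary.utils import MultipleChoiceEvaluation, MultipleChoiceQuestion

mcq = MultipleChoiceQuestion(
    question="Which base pairs with adenine in DNA?",
    options=["Cytosine", "Guanine", "Thymine", "Uracil"],
    ideal_answer="Thymine",
)

# An environment would present the question, collect an answer, map it to a
# MultipleChoiceEvaluation, and aggregate over episodes:
evals = [MultipleChoiceEvaluation.CORRECT, MultipleChoiceEvaluation.UNSURE]
accuracy, precision = MultipleChoiceEvaluation.calculate_accuracy_precision(evals)
```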