Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Transformers model as Judge #174

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 57 additions & 39 deletions src/lighteval/metrics/llm_as_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,36 +25,36 @@
import json
import re
import time
from typing import Optional
from typing import Any, Optional

from lighteval.logging.hierarchical_logger import hlog_warn
from lighteval.utils import NO_OPENAI_ERROR_MSG, is_openai_available


class JudgeOpenAI:
class JudgeLM:
"""
A class representing a judge for evaluating answers using the OpenAI API.
A class representing a judge for evaluating answers using the Transformers library.

Args:
model (str): The name of the OpenAI model to use.
model (str): The name of the model to use.
seed (int): The seed value for generating random responses.
temperature (float): The temperature value for controlling the randomness of the responses.
templates_path (str): The path to the JSON file containing the templates for prompts.

Attributes:
client: An instance of the OpenAI client.
model (str): The name of the OpenAI model.
model (str): The name of the model.
seed (int): The seed value, passed to the API when generating responses.
temperature (float): The temperature value, passed to the API when generating responses.
templates (dict): A dictionary containing the templates for prompts.
judge_type (str): Judge type based on used model.
one_score_pattern (re.Pattern): A regular expression pattern for extracting scores from the response.
one_score_pattern_backup (re.Pattern): A backup regular expression pattern for extracting scores.
API_MAX_RETRY (int): The maximum number of API retries.
API_RETRY_SLEEP (int): The sleep time between API retries.
max_tokens (int): The maximum number of tokens allowed in the response.

Methods:
evaluate_answer: Evaluates an answer using the OpenAI API.
evaluate_answer: Evaluates an answer using the OpenAI API or Transformers library.
__get_prompts_multi_turn: Generates prompts for multi-turn conversations.
__get_prompts_single_turn: Generates prompts for single-turn conversations.
__process_judge_response: Processes the judge's response and extracts the score.
Expand All @@ -66,7 +66,8 @@ def __init__(
seed: int,
temperature: float,
templates_path: str,
openai_api_key: str,
judge_type: str,
openai_api_key: Optional[str] = None,
multi_turn: bool = False,
):
self.client = None # loaded lazily
Expand All @@ -90,27 +91,41 @@ def __init__(
self.one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")

self.API_MAX_RETRY = 16
self.API_RETRY_SLEEP = 10
self.max_tokens = 2048
if judge_type == "openai":
self.client = OpenAI(api_key=openai_api_key)
self.API_MAX_RETRY = 16
self.API_RETRY_SLEEP = 10
self.max_tokens = 2048
else:
transformers_model = AutoModelForCausalLM.from_pretrained(
model, torch_dtype="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model)
self.pipe = pipeline(
"text-generation",
model=transformers_model,
tokenizer=tokenizer,
)
self.generation_args = {
"max_new_tokens": 500,
"return_full_text": False,
"temperature": temperature,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`temperature` is not needed when `do_sample` is set to `False`, since greedy decoding ignores it.

"do_sample": False,
}

def evaluate_answer(
self, questions: list[str], answers: list[str], references: list[str]
) -> tuple[int, list[dict[str, str]], str]:
) -> tuple[list[int], list[list[dict[str, str]]], list[str | None | Any]]:
"""
Evaluates an answer using the OpenAI API.
Evaluates an answer using the OpenAI API or Transformers.

Args:
questions (list[str]): A list of questions (can be a list because of multi-turn conversations)
answers (list[str]): A list of answers, one for each question.
references (list[str]): A list of reference answers, one for each question (sometimes not available)
single_turn (bool): Indicates whether the conversation is single-turn or multi-turn.

Returns:
A tuple containing the score, prompts, and judgment.

Raises:
Exception: If an error occurs during the API call.
"""
if self.client is None:
if not is_openai_available():
Expand All @@ -122,7 +137,7 @@ def evaluate_answer(

prompts = [
self.__get_prompts_single_turn(
questions[0], answers[0], references[0] if references is not None and len(references) > 0 else None
questions[0], answers[0], references[0] if references and len(references) > 0 else None
)
]

Expand All @@ -132,32 +147,35 @@ def evaluate_answer(
)
prompts.append(prompts_multi_turn)

responses = []
judgments = []
for prompt in prompts:
for _ in range(self.API_MAX_RETRY):
try:
response = self.client.chat.completions.create(
model=self.model,
seed=self.seed,
temperature=self.temperature,
messages=prompt,
max_tokens=self.max_tokens,
n=1,
)
responses.append(response)
break
except Exception as e:
hlog_warn(f"{type(e), e}")
time.sleep(self.API_RETRY_SLEEP)

if len(responses) == 0:
raise Exception("Failed to get response from the API")

judgments = [response.choices[0].message.content for response in responses]
if hasattr(self, "client"):
response = self.__call_openai_api(prompt)
else:
response = self.pipe(prompt)[0]["generated_text"]
judgments.append(response)

scores = [self.__process_judge_response(judgment) for judgment in judgments]

return scores, prompts, judgments

def __call_openai_api(self, prompt):
for _ in range(self.API_MAX_RETRY):
try:
response = self.client.chat.completions.create(
model=self.model,
seed=self.seed,
temperature=self.temperature,
messages=prompt,
max_tokens=self.max_tokens,
n=1,
)
return response.choices[0].message.content
except Exception as e:
hlog_warn(f"{type(e), e}")
time.sleep(self.API_RETRY_SLEEP)
raise Exception("Failed to get response from the API")

def __get_prompts_multi_turn(
self, questions: list[str], answers: list[str], references: Optional[list[str]]
) -> list[dict[str, str]]:
Expand Down
16 changes: 12 additions & 4 deletions src/lighteval/metrics/metrics_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

import nltk
import numpy as np
from huggingface_hub import HfApi
from nltk.metrics.distance import edit_distance
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer
Expand All @@ -40,7 +41,7 @@
from lighteval.metrics.imports.bert_scorer import BERTScorer
from lighteval.metrics.imports.data_stats_metric import DataStatsMetric
from lighteval.metrics.imports.summac import SummaCZS
from lighteval.metrics.llm_as_judge import JudgeOpenAI
from lighteval.metrics.llm_as_judge import JudgeLM
from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip
from lighteval.tasks.requests import Doc
from lighteval.utils import as_list
Expand Down Expand Up @@ -622,11 +623,18 @@ def edit_similarity(self, s1, s2):


class JudgeLLM:
available_models = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]
gpt_models = ["gpt-3.5-turbo"]

def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
if judge_model_name not in self.available_models:
raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
if judge_model_name in self.gpt_models:
judge_type = "openai"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This wouldn't work: if we pass in `gpt-12`, for example, it will set the judge type to `openai` and continue, only to fail later because `gpt-12` does not exist in the OpenAI API.

else:
api = HfApi()
models = api.list_models(model_name=judge_model_name)
if models:
judge_type = "transformers"
else:
raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
self.multi_turn = multi_turn
Expand Down
Loading