From fdb22c7df00ad796c6070b5122d99e65627a180a Mon Sep 17 00:00:00 2001
From: anilaltuner
Date: Fri, 26 Apr 2024 11:24:12 +0300
Subject: [PATCH 1/4] Transformers as Judge added

---
 src/lighteval/metrics/llm_as_judge.py   | 114 ++++++++++++++----
 src/lighteval/metrics/metrics_sample.py |  12 +--
 2 files changed, 73 insertions(+), 53 deletions(-)

diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index 12b637a3..a7f31e08 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -25,29 +25,30 @@
 import json
 import re
 import time
-from typing import Optional
+from typing import Optional, Any
 
 from openai import OpenAI
 
 from lighteval.logging.hierarchical_logger import hlog_warn
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
 
-class JudgeOpenAI:
+class JudgeLM:
     """
-    A class representing a judge for evaluating answers using the OpenAI API.
+    A class representing a judge for evaluating answers using the Transformers library.
 
     Args:
-        model (str): The name of the OpenAI model to use.
+        model (str): The name of the model to use.
         seed (int): The seed value for generating random responses.
         temperature (float): The temperature value for controlling the randomness of the responses.
         templates_path (str): The path to the JSON file containing the templates for prompts.
 
     Attributes:
-        client: An instance of the OpenAI client.
-        model (str): The name of the OpenAI model.
+        model (str): The name of the model.
         seed (int): The seed value, passed to the API when generating responses.
         temperature (float): The temperature value, passed to the API when generating responses.
         templates (dict): A dictionary containing the templates for prompts.
+        judge_type (str): Judge type based on used model.
         one_score_pattern (re.Pattern): A regular expression pattern for extracting scores from the response.
         one_score_pattern_backup (re.Pattern): A backup regular expression pattern for extracting scores.
         API_MAX_RETRY (int): The maximum number of API retries.
@@ -55,22 +56,23 @@ class JudgeOpenAI:
         max_tokens (int): The maximum number of tokens allowed in the response.
 
     Methods:
-        evaluate_answer: Evaluates an answer using the OpenAI API.
+        evaluate_answer: Evaluates an answer using the OpenAI API or Transformers library.
         __get_prompts_multi_turn: Generates prompts for multi-turn conversations.
         __get_prompts_single_turn: Generates prompts for single-turn conversations.
         __process_judge_response: Processes the judge's response and extracts the score.
     """
 
     def __init__(
-        self,
-        model: str,
-        seed: int,
-        temperature: float,
-        templates_path: str,
-        openai_api_key: str,
-        multi_turn: bool = False,
+            self,
+            model: str,
+            seed: int,
+            temperature: float,
+            templates_path: str,
+            judge_type: str,
+            openai_api_key: Optional[str] = None,
+            multi_turn: bool = False,
     ):
-        self.client = OpenAI(api_key=openai_api_key)
+
         self.model = model
         self.seed = seed
         self.temperature = temperature
@@ -90,31 +92,44 @@ def __init__(
         self.one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
         self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")
 
-        self.API_MAX_RETRY = 16
-        self.API_RETRY_SLEEP = 10
-        self.max_tokens = 2048
+        if judge_type == "openai":
+            self.client = OpenAI(api_key=openai_api_key)
+            self.API_MAX_RETRY = 16
+            self.API_RETRY_SLEEP = 10
+            self.max_tokens = 2048
+        else:
+            transformers_model = AutoModelForCausalLM.from_pretrained(model,
+                torch_dtype="auto", trust_remote_code=True)
+            tokenizer = AutoTokenizer.from_pretrained(model)
+            self.pipe = pipeline(
+                "text-generation",
+                model=transformers_model,
+                tokenizer=tokenizer,
+            )
+            self.generation_args = {
+                "max_new_tokens": 500,
+                "return_full_text": False,
+                "temperature": temperature,
+                "do_sample": False,
+            }
 
     def evaluate_answer(
-        self, questions: list[str], answers: list[str], references: list[str]
-    ) -> tuple[int, list[dict[str, str]], str]:
+            self, questions: list[str], answers: list[str], references: list[str]
+    ) -> tuple[list[int], list[list[dict[str, str]]], list[str | None | Any]]:
         """
-        Evaluates an answer using the OpenAI API.
+        Evaluates an answer using the OpenAI API or Transformers.
 
         Args:
             questions (list[str]): A list of questions (can be a list because of multi-turn conversations)
             answers (list[str]): A list of answers, one for each question.
             references (list[str]): A list of reference answers, one for each question (sometimes not available)
-            single_turn (bool): Indicates whether the conversation is single-turn or multi-turn.
 
         Returns:
             A tuple containing the score, prompts, and judgment.
-
-        Raises:
-            Exception: If an error occurs during the API call.
         """
         prompts = [
             self.__get_prompts_single_turn(
-                questions[0], answers[0], references[0] if references is not None and len(references) > 0 else None
+                questions[0], answers[0], references[0] if references and len(references) > 0 else None
             )
         ]
 
@@ -124,34 +139,37 @@ def evaluate_answer(
             )
             prompts.append(prompts_multi_turn)
 
-        responses = []
+        judgments = []
         for prompt in prompts:
-            for _ in range(self.API_MAX_RETRY):
-                try:
-                    response = self.client.chat.completions.create(
-                        model=self.model,
-                        seed=self.seed,
-                        temperature=self.temperature,
-                        messages=prompt,
-                        max_tokens=self.max_tokens,
-                        n=1,
-                    )
-                    responses.append(response)
-                    break
-                except Exception as e:
-                    hlog_warn(f"{type(e), e}")
-                    time.sleep(self.API_RETRY_SLEEP)
-
-        if len(responses) == 0:
-            raise Exception("Failed to get response from the API")
-
-        judgments = [response.choices[0].message.content for response in responses]
+            if hasattr(self, 'client'):
+                response = self.__call_openai_api(prompt)
+            else:
+                response = self.pipe(prompt)[0]['generated_text']
+            judgments.append(response)
+
         scores = [self.__process_judge_response(judgment) for judgment in judgments]
 
         return scores, prompts, judgments
 
+    def __call_openai_api(self, prompt):
+        for _ in range(self.API_MAX_RETRY):
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.model,
+                    seed=self.seed,
+                    temperature=self.temperature,
+                    messages=prompt,
+                    max_tokens=self.max_tokens,
+                    n=1,
+                )
+                return response.choices[0].message.content
+            except Exception as e:
+                hlog_warn(f"{type(e), e}")
+                time.sleep(self.API_RETRY_SLEEP)
+        raise Exception("Failed to get response from the API")
+
     def __get_prompts_multi_turn(
-        self, questions: list[str], answers: list[str], references: Optional[list[str]]
+            self, questions: list[str], answers: list[str], references: Optional[list[str]]
     ) -> list[dict[str, str]]:
         """
         Generates prompts for multi-turn conversations. The prompts are generated based on the templates.
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index a3809adb..53581f7b 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -39,7 +39,7 @@
 from lighteval.metrics.imports.bert_scorer import BERTScorer
 from lighteval.metrics.imports.data_stats_metric import DataStatsMetric
 from lighteval.metrics.imports.summac import SummaCZS
-from lighteval.metrics.llm_as_judge import JudgeOpenAI
+from lighteval.metrics.llm_as_judge import JudgeLM
 from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip
 from lighteval.tasks.requests import Doc
 from lighteval.utils import as_list
@@ -619,28 +619,30 @@ def edit_similarity(self, s1, s2):
         edist = edit_distance(s1, s2)
         return 1.0 - edist / max(len(s1), len(s2)) if len(s1) > 0 and len(s2) > 0 else 0
 
-
 class JudgeLLM:
     available_models = ["gpt-3.5-turbo"]
 
     def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
         if judge_model_name not in self.available_models:
-            raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
+            judge_type = "openai"
+        else:
+            judge_type = "transformers"
 
         OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
         self.multi_turn = multi_turn
 
         try:
-            self.judge = JudgeOpenAI(
+            self.judge = JudgeLM(
                 model=judge_model_name,
                 seed=42,
                 temperature=0.0,
                 templates_path=template_path,
+                judge_type=judge_type,
                 openai_api_key=OPENAI_API_KEY,
                 multi_turn=multi_turn,
             )
         except Exception as e:
-            print(f"Could not initialize the JudgeOpenAI model:\n{e}")
+            print(f"Could not initialize the JudgeLLM model:\n{e}")
             self.judge = None
 
     def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:

From 6906b04f3775becd0217ea9c0568b7a543b908f5 Mon Sep 17 00:00:00 2001
From: anilaltuner
Date: Fri, 26 Apr 2024 11:24:58 +0300
Subject: [PATCH 2/4] Transformers as Judge added

---
 src/lighteval/metrics/metrics_sample.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index 53581f7b..6b071d54 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -642,7 +642,7 @@ def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool =
                 multi_turn=multi_turn,
             )
         except Exception as e:
-            print(f"Could not initialize the JudgeLLM model:\n{e}")
+            print(f"Could not initialize the JudgeLM model:\n{e}")
             self.judge = None
 
     def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:

From 8a33c121edab14619d4d0afb87ed5a19eec34a1c Mon Sep 17 00:00:00 2001
From: anilaltuner
Date: Tue, 30 Apr 2024 16:58:29 +0300
Subject: [PATCH 3/4] Formatting fix

---
 src/lighteval/metrics/llm_as_judge.py   | 34 ++++++++++++-------------
 src/lighteval/metrics/metrics_sample.py |  1 +
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index a7f31e08..9d717d0f 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -25,12 +25,12 @@
 import json
 import re
 import time
-from typing import Optional, Any
+from typing import Any, Optional
 
 from openai import OpenAI
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
 from lighteval.logging.hierarchical_logger import hlog_warn
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
 
 class JudgeLM:
@@ -63,16 +63,15 @@ class JudgeLM:
     """
 
     def __init__(
-            self,
-            model: str,
-            seed: int,
-            temperature: float,
-            templates_path: str,
-            judge_type: str,
-            openai_api_key: Optional[str] = None,
-            multi_turn: bool = False,
+        self,
+        model: str,
+        seed: int,
+        temperature: float,
+        templates_path: str,
+        judge_type: str,
+        openai_api_key: Optional[str] = None,
+        multi_turn: bool = False,
     ):
-
         self.model = model
         self.seed = seed
         self.temperature = temperature
@@ -98,8 +97,9 @@ def __init__(
             self.API_RETRY_SLEEP = 10
             self.max_tokens = 2048
         else:
-            transformers_model = AutoModelForCausalLM.from_pretrained(model,
-                torch_dtype="auto", trust_remote_code=True)
+            transformers_model = AutoModelForCausalLM.from_pretrained(
+                model, torch_dtype="auto", trust_remote_code=True
+            )
             tokenizer = AutoTokenizer.from_pretrained(model)
             self.pipe = pipeline(
                 "text-generation",
                 model=transformers_model,
                 tokenizer=tokenizer,
             )
@@ -114,7 +114,7 @@ def __init__(
         }
 
     def evaluate_answer(
-            self, questions: list[str], answers: list[str], references: list[str]
+        self, questions: list[str], answers: list[str], references: list[str]
     ) -> tuple[list[int], list[list[dict[str, str]]], list[str | None | Any]]:
         """
         Evaluates an answer using the OpenAI API or Transformers.
@@ -141,10 +141,10 @@ def evaluate_answer(
 
         judgments = []
         for prompt in prompts:
-            if hasattr(self, 'client'):
+            if hasattr(self, "client"):
                 response = self.__call_openai_api(prompt)
             else:
-                response = self.pipe(prompt)[0]['generated_text']
+                response = self.pipe(prompt)[0]["generated_text"]
             judgments.append(response)
 
         scores = [self.__process_judge_response(judgment) for judgment in judgments]
@@ -169,7 +169,7 @@ def __call_openai_api(self, prompt):
         raise Exception("Failed to get response from the API")
 
     def __get_prompts_multi_turn(
-            self, questions: list[str], answers: list[str], references: Optional[list[str]]
+        self, questions: list[str], answers: list[str], references: Optional[list[str]]
     ) -> list[dict[str, str]]:
         """
         Generates prompts for multi-turn conversations. The prompts are generated based on the templates.
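
A quick usage sketch of the new JudgeLM transformers path, for readers following the series (not part of the patches; the model name and templates path below are hypothetical placeholders). Note that the generation_args dict built in __init__ is never passed to the pipeline call in this version, so the transformers judge runs with the pipeline's default generation settings; the commented line shows what wiring them through might look like. This also assumes a transformers release whose text-generation pipeline accepts OpenAI-style chat messages directly; older releases would need the prompt rendered through the tokenizer's chat template first.

    # Sketch only: exercising the transformers judge path added in this series.
    judge = JudgeLM(
        model="HuggingFaceH4/zephyr-7b-beta",   # hypothetical judge model
        seed=42,
        temperature=0.0,
        templates_path="judge_prompts.jsonl",   # hypothetical templates file
        judge_type="transformers",
        multi_turn=False,
    )
    scores, prompts, judgments = judge.evaluate_answer(
        questions=["What is the capital of France?"],
        answers=["Paris is the capital of France."],
        references=[],
    )
    # As written in the patch, evaluate_answer calls self.pipe(prompt) without
    # generation arguments; passing them through would look like:
    # response = self.pipe(prompt, **self.generation_args)[0]["generated_text"]
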
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index 6b071d54..d9f828a1 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -619,6 +619,7 @@ def edit_similarity(self, s1, s2):
         edist = edit_distance(s1, s2)
         return 1.0 - edist / max(len(s1), len(s2)) if len(s1) > 0 and len(s2) > 0 else 0
 
+
 class JudgeLLM:
     available_models = ["gpt-3.5-turbo"]
 

From aef34a7c206e743d0c446d5fda7396bfa94f9539 Mon Sep 17 00:00:00 2001
From: anilaltuner
Date: Thu, 23 May 2024 11:52:05 +0300
Subject: [PATCH 4/4] Check model from HfApi

---
 src/lighteval/metrics/metrics_sample.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index d9f828a1..dc4a75ba 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -28,6 +28,7 @@
 
 import nltk
 import numpy as np
+from huggingface_hub import HfApi
 from nltk.metrics.distance import edit_distance
 from nltk.tokenize import word_tokenize
 from nltk.tokenize.treebank import TreebankWordTokenizer
@@ -621,13 +622,18 @@ def edit_similarity(self, s1, s2):
 
 
 class JudgeLLM:
-    available_models = ["gpt-3.5-turbo"]
+    gpt_models = ["gpt-3.5-turbo"]
 
     def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
-        if judge_model_name not in self.available_models:
+        if judge_model_name in self.gpt_models:
             judge_type = "openai"
         else:
-            judge_type = "transformers"
+            api = HfApi()
+            models = api.list_models(model_name=judge_model_name)
+            if models:
+                judge_type = "transformers"
+            else:
+                raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
 
         OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
         self.multi_turn = multi_turn
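
A behavioural note on the HfApi check above: depending on the installed huggingface_hub version, list_models returns a lazy iterator rather than a list, and an iterator object is truthy even when it yields nothing, so the ValueError branch may never fire. A sketch of a more explicit existence check (assuming huggingface_hub's list_models supports the limit argument, and keeping the original error message):

    # Sketch only: materialize at most one search result so the emptiness check is meaningful.
    from huggingface_hub import HfApi

    api = HfApi()
    matches = list(api.list_models(model_name=judge_model_name, limit=1))
    if matches:
        judge_type = "transformers"
    else:
        raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
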