From fdb22c7df00ad796c6070b5122d99e65627a180a Mon Sep 17 00:00:00 2001
From: anilaltuner
Date: Fri, 26 Apr 2024 11:24:12 +0300
Subject: [PATCH 1/4] Transformers as Judge added

---
 src/lighteval/metrics/llm_as_judge.py   | 114 ++++++++++++++----
 src/lighteval/metrics/metrics_sample.py |  12 +--
 2 files changed, 73 insertions(+), 53 deletions(-)

diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index 12b637a3..a7f31e08 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -25,29 +25,30 @@
 import json
 import re
 import time
-from typing import Optional
+from typing import Optional, Any
 
 from openai import OpenAI
 
 from lighteval.logging.hierarchical_logger import hlog_warn
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
 
-class JudgeOpenAI:
+class JudgeLM:
     """
-    A class representing a judge for evaluating answers using the OpenAI API.
+    A class representing a judge for evaluating answers using the Transformers library.
 
     Args:
-        model (str): The name of the OpenAI model to use.
+        model (str): The name of the model to use.
         seed (int): The seed value for generating random responses.
         temperature (float): The temperature value for controlling the randomness of the responses.
         templates_path (str): The path to the JSON file containing the templates for prompts.
 
     Attributes:
-        client: An instance of the OpenAI client.
-        model (str): The name of the OpenAI model.
+        model (str): The name of the model.
         seed (int): The seed value, passed to the API when generating responses.
         temperature (float): The temperature value, passed to the API when generating responses.
         templates (dict): A dictionary containing the templates for prompts.
+        judge_type (str): Judge type based on used model.
         one_score_pattern (re.Pattern): A regular expression pattern for extracting scores from the response.
         one_score_pattern_backup (re.Pattern): A backup regular expression pattern for extracting scores.
         API_MAX_RETRY (int): The maximum number of API retries.
@@ -55,22 +56,23 @@ class JudgeOpenAI:
         max_tokens (int): The maximum number of tokens allowed in the response.
 
     Methods:
-        evaluate_answer: Evaluates an answer using the OpenAI API.
+        evaluate_answer: Evaluates an answer using the OpenAI API or Transformers library.
         __get_prompts_multi_turn: Generates prompts for multi-turn conversations.
         __get_prompts_single_turn: Generates prompts for single-turn conversations.
         __process_judge_response: Processes the judge's response and extracts the score.
     """
 
     def __init__(
-        self,
-        model: str,
-        seed: int,
-        temperature: float,
-        templates_path: str,
-        openai_api_key: str,
-        multi_turn: bool = False,
+            self,
+            model: str,
+            seed: int,
+            temperature: float,
+            templates_path: str,
+            judge_type: str,
+            openai_api_key: Optional[str] = None,
+            multi_turn: bool = False,
     ):
-        self.client = OpenAI(api_key=openai_api_key)
+
         self.model = model
         self.seed = seed
         self.temperature = temperature
@@ -90,31 +92,44 @@ def __init__(
         self.one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
         self.one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")
 
-        self.API_MAX_RETRY = 16
-        self.API_RETRY_SLEEP = 10
-        self.max_tokens = 2048
+        if judge_type == "openai":
+            self.client = OpenAI(api_key=openai_api_key)
+            self.API_MAX_RETRY = 16
+            self.API_RETRY_SLEEP = 10
+            self.max_tokens = 2048
+        else:
+            transformers_model = AutoModelForCausalLM.from_pretrained(model,
+                torch_dtype="auto", trust_remote_code=True)
+            tokenizer = AutoTokenizer.from_pretrained(model)
+            self.pipe = pipeline(
+                "text-generation",
+                model=transformers_model,
+                tokenizer=tokenizer,
+            )
+            self.generation_args = {
+                "max_new_tokens": 500,
+                "return_full_text": False,
+                "temperature": temperature,
+                "do_sample": False,
+            }
 
     def evaluate_answer(
-        self, questions: list[str], answers: list[str], references: list[str]
-    ) -> tuple[int, list[dict[str, str]], str]:
+            self, questions: list[str], answers: list[str], references: list[str]
+    ) -> tuple[list[int], list[list[dict[str, str]]], list[str | None | Any]]:
         """
-        Evaluates an answer using the OpenAI API.
+        Evaluates an answer using the OpenAI API or Transformers.
 
         Args:
             questions (list[str]): A list of questions (can be a list because of multi-turn conversations)
             answers (list[str]): A list of answers, one for each question.
             references (list[str]): A list of reference answers, one for each question (sometimes not available)
-            single_turn (bool): Indicates whether the conversation is single-turn or multi-turn.
 
         Returns:
             A tuple containing the score, prompts, and judgment.
-
-        Raises:
-            Exception: If an error occurs during the API call.
         """
         prompts = [
             self.__get_prompts_single_turn(
-                questions[0], answers[0], references[0] if references is not None and len(references) > 0 else None
+                questions[0], answers[0], references[0] if references and len(references) > 0 else None
             )
         ]
 
@@ -124,34 +139,37 @@ def evaluate_answer(
             )
             prompts.append(prompts_multi_turn)
 
-        responses = []
+        judgments = []
         for prompt in prompts:
-            for _ in range(self.API_MAX_RETRY):
-                try:
-                    response = self.client.chat.completions.create(
-                        model=self.model,
-                        seed=self.seed,
-                        temperature=self.temperature,
-                        messages=prompt,
-                        max_tokens=self.max_tokens,
-                        n=1,
-                    )
-                    responses.append(response)
-                    break
-                except Exception as e:
-                    hlog_warn(f"{type(e), e}")
-                    time.sleep(self.API_RETRY_SLEEP)
-
-        if len(responses) == 0:
-            raise Exception("Failed to get response from the API")
-
-        judgments = [response.choices[0].message.content for response in responses]
+            if hasattr(self, 'client'):
+                response = self.__call_openai_api(prompt)
+            else:
+                response = self.pipe(prompt)[0]['generated_text']
+            judgments.append(response)
+
         scores = [self.__process_judge_response(judgment) for judgment in judgments]
 
         return scores, prompts, judgments
 
+    def __call_openai_api(self, prompt):
+        for _ in range(self.API_MAX_RETRY):
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.model,
+                    seed=self.seed,
+                    temperature=self.temperature,
+                    messages=prompt,
+                    max_tokens=self.max_tokens,
+                    n=1,
+                )
+                return response.choices[0].message.content
+            except Exception as e:
+                hlog_warn(f"{type(e), e}")
+                time.sleep(self.API_RETRY_SLEEP)
+        raise Exception("Failed to get response from the API")
+
     def __get_prompts_multi_turn(
-        self, questions: list[str], answers: list[str], references: Optional[list[str]]
+            self, questions: list[str], answers: list[str], references: Optional[list[str]]
     ) -> list[dict[str, str]]:
         """
         Generates prompts for multi-turn conversations. The prompts are generated based on the templates.
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index a3809adb..53581f7b 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -39,7 +39,7 @@
 from lighteval.metrics.imports.bert_scorer import BERTScorer
 from lighteval.metrics.imports.data_stats_metric import DataStatsMetric
 from lighteval.metrics.imports.summac import SummaCZS
-from lighteval.metrics.llm_as_judge import JudgeOpenAI
+from lighteval.metrics.llm_as_judge import JudgeLM
 from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip
 from lighteval.tasks.requests import Doc
 from lighteval.utils import as_list
@@ -619,28 +619,30 @@ def edit_similarity(self, s1, s2):
         edist = edit_distance(s1, s2)
         return 1.0 - edist / max(len(s1), len(s2)) if len(s1) > 0 and len(s2) > 0 else 0
 
-
 class JudgeLLM:
     available_models = ["gpt-3.5-turbo"]
 
     def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
         if judge_model_name not in self.available_models:
-            raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
+            judge_type = "openai"
+        else:
+            judge_type = "transformers"
 
         OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
         self.multi_turn = multi_turn
 
         try:
-            self.judge = JudgeOpenAI(
+            self.judge = JudgeLM(
                 model=judge_model_name,
                 seed=42,
                 temperature=0.0,
                 templates_path=template_path,
+                judge_type=judge_type,
                 openai_api_key=OPENAI_API_KEY,
                 multi_turn=multi_turn,
             )
         except Exception as e:
-            print(f"Could not initialize the JudgeOpenAI model:\n{e}")
+            print(f"Could not initialize the JudgeLLM model:\n{e}")
             self.judge = None
 
     def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:

From 6906b04f3775becd0217ea9c0568b7a543b908f5 Mon Sep 17 00:00:00 2001
From: anilaltuner
Date: Fri, 26 Apr 2024 11:24:58 +0300
Subject: [PATCH 2/4] Transformers as Judge added

---
 src/lighteval/metrics/metrics_sample.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index 53581f7b..6b071d54 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -642,7 +642,7 @@ def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool =
                 multi_turn=multi_turn,
             )
         except Exception as e:
-            print(f"Could not initialize the JudgeLLM model:\n{e}")
+            print(f"Could not initialize the JudgeLM model:\n{e}")
             self.judge = None
 
     def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:

From 8a33c121edab14619d4d0afb87ed5a19eec34a1c Mon Sep 17 00:00:00 2001
From: anilaltuner
Date: Tue, 30 Apr 2024 16:58:29 +0300
Subject: [PATCH 3/4] Formatting fix

---
 src/lighteval/metrics/llm_as_judge.py   | 34 ++++++++++++-------------
 src/lighteval/metrics/metrics_sample.py |  1 +
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py
index a7f31e08..9d717d0f 100644
--- a/src/lighteval/metrics/llm_as_judge.py
+++ b/src/lighteval/metrics/llm_as_judge.py
@@ -25,12 +25,12 @@
 import json
 import re
 import time
-from typing import Optional, Any
+from typing import Any, Optional
 
 from openai import OpenAI
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
 from lighteval.logging.hierarchical_logger import hlog_warn
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
 
 class JudgeLM:
@@ -63,16 +63,15 @@ class JudgeLM:
     """
 
     def __init__(
-            self,
-            model: str,
-            seed: int,
-            temperature: float,
-            templates_path: str,
-            judge_type: str,
-            openai_api_key: Optional[str] = None,
-            multi_turn: bool = False,
+        self,
+        model: str,
+        seed: int,
+        temperature: float,
+        templates_path: str,
+        judge_type: str,
+        openai_api_key: Optional[str] = None,
+        multi_turn: bool = False,
     ):
-
         self.model = model
         self.seed = seed
         self.temperature = temperature
@@ -98,8 +97,9 @@ def __init__(
             self.API_RETRY_SLEEP = 10
             self.max_tokens = 2048
         else:
-            transformers_model = AutoModelForCausalLM.from_pretrained(model,
-                torch_dtype="auto", trust_remote_code=True)
+            transformers_model = AutoModelForCausalLM.from_pretrained(
+                model, torch_dtype="auto", trust_remote_code=True
+            )
             tokenizer = AutoTokenizer.from_pretrained(model)
             self.pipe = pipeline(
                 "text-generation",
                 model=transformers_model,
                 tokenizer=tokenizer,
             )
@@ -114,7 +114,7 @@ def __init__(
         }
 
     def evaluate_answer(
-            self, questions: list[str], answers: list[str], references: list[str]
+        self, questions: list[str], answers: list[str], references: list[str]
     ) -> tuple[list[int], list[list[dict[str, str]]], list[str | None | Any]]:
         """
         Evaluates an answer using the OpenAI API or Transformers.
@@ -141,10 +141,10 @@ def evaluate_answer(
 
         judgments = []
         for prompt in prompts:
-            if hasattr(self, 'client'):
+            if hasattr(self, "client"):
                 response = self.__call_openai_api(prompt)
             else:
-                response = self.pipe(prompt)[0]['generated_text']
+                response = self.pipe(prompt)[0]["generated_text"]
             judgments.append(response)
 
         scores = [self.__process_judge_response(judgment) for judgment in judgments]
@@ -169,7 +169,7 @@ def __call_openai_api(self, prompt):
         raise Exception("Failed to get response from the API")
 
     def __get_prompts_multi_turn(
-            self, questions: list[str], answers: list[str], references: Optional[list[str]]
+        self, questions: list[str], answers: list[str], references: Optional[list[str]]
     ) -> list[dict[str, str]]:
         """
         Generates prompts for multi-turn conversations. The prompts are generated based on the templates.
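
A quick usage sketch of the new JudgeLM transformers path, for readers following the series (not part of the patches; the model name and templates path below are hypothetical placeholders). Note that the generation_args dict built in __init__ is never passed to the pipeline call in this version, so the transformers judge runs with the pipeline's default generation settings; the commented line shows what wiring them through might look like. This also assumes a transformers release whose text-generation pipeline accepts OpenAI-style chat messages directly; older releases would need the prompt rendered through the tokenizer's chat template first.

    # Sketch only: exercising the transformers judge path added in this series.
    judge = JudgeLM(
        model="HuggingFaceH4/zephyr-7b-beta",   # hypothetical judge model
        seed=42,
        temperature=0.0,
        templates_path="judge_prompts.jsonl",   # hypothetical templates file
        judge_type="transformers",
        multi_turn=False,
    )
    scores, prompts, judgments = judge.evaluate_answer(
        questions=["What is the capital of France?"],
        answers=["Paris is the capital of France."],
        references=[],
    )
    # As written in the patch, evaluate_answer calls self.pipe(prompt) without
    # generation arguments; passing them through would look like:
    # response = self.pipe(prompt, **self.generation_args)[0]["generated_text"]
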
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index 6b071d54..d9f828a1 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -619,6 +619,7 @@ def edit_similarity(self, s1, s2):
         edist = edit_distance(s1, s2)
         return 1.0 - edist / max(len(s1), len(s2)) if len(s1) > 0 and len(s2) > 0 else 0
 
+
 class JudgeLLM:
     available_models = ["gpt-3.5-turbo"]
 

From aef34a7c206e743d0c446d5fda7396bfa94f9539 Mon Sep 17 00:00:00 2001
From: anilaltuner
Date: Thu, 23 May 2024 11:52:05 +0300
Subject: [PATCH 4/4] Check model from HfApi

---
 src/lighteval/metrics/metrics_sample.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index d9f828a1..dc4a75ba 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -28,6 +28,7 @@
 
 import nltk
 import numpy as np
+from huggingface_hub import HfApi
 from nltk.metrics.distance import edit_distance
 from nltk.tokenize import word_tokenize
 from nltk.tokenize.treebank import TreebankWordTokenizer
@@ -621,13 +622,18 @@ def edit_similarity(self, s1, s2):
 
 
 class JudgeLLM:
-    available_models = ["gpt-3.5-turbo"]
+    gpt_models = ["gpt-3.5-turbo"]
 
     def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
-        if judge_model_name not in self.available_models:
+        if judge_model_name in self.gpt_models:
             judge_type = "openai"
         else:
-            judge_type = "transformers"
+            api = HfApi()
+            models = api.list_models(model_name=judge_model_name)
+            if models:
+                judge_type = "transformers"
+            else:
+                raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
 
         OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
         self.multi_turn = multi_turn
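
A behavioural note on the HfApi check above: depending on the installed huggingface_hub version, list_models returns a lazy iterator rather than a list, and an iterator object is truthy even when it yields nothing, so the ValueError branch may never fire. A sketch of a more explicit existence check (assuming huggingface_hub's list_models supports the limit argument, and keeping the original error message):

    # Sketch only: materialize at most one search result so the emptiness check is meaningful.
    from huggingface_hub import HfApi

    api = HfApi()
    matches = list(api.list_models(model_name=judge_model_name, limit=1))
    if matches:
        judge_type = "transformers"
    else:
        raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")
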