[Feature] Added Bradley-Terry subjective evaluation
Showing 13 changed files with 1,230 additions and 45 deletions.
configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py (125 additions, 0 deletions)
@@ -0,0 +1,125 @@
from mmengine.config import read_base

from opencompass.datasets import AlpacaEvalDataset, alpacaeval_bradleyterry_postprocess
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever

subjective_reader_cfg = dict(
    input_columns=['question'],
    output_column='judge',
)

subjective_all_sets = [
    'alpaca_eval',
]


alpacav2_datasets = []

gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
## Instruction
{
"instruction": "{question}",
}
## Model Outputs
Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
{
{
"model_identifier": "m",
"output": "{prediction}"
},
{
"model_identifier": "M",
"output": "{prediction2}"
}
}
## Task
Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
## Best Model Identifier
"""

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

gpt4 = [
    dict(
        abbr='gpt4-turbo',
    )
]

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt='{question}'),
                ]
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=4096),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt='You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.',
                        )
                    ],
                    round=[
                        dict(role='HUMAN', prompt=gpt4_prompt),
                    ],
                ),
            ),
            dict_postprocessor=dict(
                type=alpacaeval_bradleyterry_postprocess,
            ),
            keep_predictions=True,  # Must be turned on to save predictions from model pairs to calculate style features in postprocessor
        ),
        pred_role='BOT',
    )

    alpacav2_datasets.append(
        dict(
            abbr=f'{_name}',
            type=AlpacaEvalDataset,
            path='./data/subjective/alpaca_eval',
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='m2n',
            infer_order='random',
            base_models=gpt4,
            given_pred=[
                {
                    'abbr': 'gpt4-turbo',
                    'path': './data/subjective/alpaca_eval/gpt4-turbo',
                }
            ],
        )
    )
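
For context on where these judge verdicts end up: a Bradley-Terry model assigns each model a strength p_i such that P(model i is preferred over model j) = p_i / (p_i + p_j), and fits those strengths from the pairwise wins the judge produces. Below is a minimal, self-contained sketch of that fit using the classic minorization-maximization update. It is an illustration only, with made-up function names and data layout, not the implementation behind alpacaeval_bradleyterry_postprocess.

from collections import defaultdict
from math import log


def fit_bradley_terry(matches, n_iter=200, tol=1e-8):
    """Fit Bradley-Terry strengths from (winner, loser) pairs via MM updates.

    Under the model, P(i beats j) = p[i] / (p[i] + p[j]); the returned
    ratings are log-strengths, identified only up to a common constant.
    """
    models = sorted({m for pair in matches for m in pair})
    wins = defaultdict(float)    # total wins per model
    games = defaultdict(float)   # number of head-to-head games per pair
    for winner, loser in matches:
        wins[winner] += 1.0
        games[frozenset((winner, loser))] += 1.0

    p = {m: 1.0 for m in models}
    for _ in range(n_iter):
        new_p = {}
        for i in models:
            denom = sum(
                games[frozenset((i, j))] / (p[i] + p[j])
                for j in models
                if j != i and games[frozenset((i, j))] > 0
            )
            # Floor at a tiny value so a model with zero wins keeps a finite rating.
            new_p[i] = max(wins[i] / denom, 1e-9) if denom > 0 else p[i]
        # Fix the scale: normalize so the geometric mean of strengths is 1.
        geo_mean = 1.0
        for value in new_p.values():
            geo_mean *= value
        geo_mean **= 1.0 / len(new_p)
        new_p = {m: value / geo_mean for m, value in new_p.items()}
        if max(abs(new_p[m] - p[m]) for m in models) < tol:
            p = new_p
            break
        p = new_p
    return {m: log(value) for m, value in p.items()}


# Toy example: the judge preferred 'my-model' twice and 'gpt4-turbo' once.
print(fit_bradley_terry([('my-model', 'gpt4-turbo'),
                         ('gpt4-turbo', 'my-model'),
                         ('my-model', 'gpt4-turbo')]))

With three games at a 2:1 win rate, the fitted strengths settle at a 2:1 ratio, i.e. the model's predicted win probability matches the empirical one.
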
configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py (173 additions, 0 deletions)
@@ -0,0 +1,173 @@
from opencompass.datasets import (
    CompassArenaDataset,
    compassarena_bradleyterry_postprocess,
)
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.summarizers import CompassArenaSummarizer

subjective_reader_cfg = dict(
    input_columns=['question', 'ref'],
    output_column='judge',
)

data_path = 'data/subjective/compass_arena'

compassarena_datasets = []

base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求,在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
并提供你的解释原因。
如果你认为回答1更好,你的输出应形如:
选择:A
原因:blahblah blahblah\n
如果你认为回答2更好,你的输出应形如:
选择:B
原因:blahblah blahblah\n
如果你认为回答1、2打成平手,你的输出应形如:
选择:C
原因:blahblah blahblah\n
"""

knowledge_prompt = (
    """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答能与参考答案吻合或表明参考答案的意思。
2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。
[用户问题]
{question}
[参考答案]
{ref}
"""
    + base_prompt
)


language_prompt = (
    """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好。
2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误。
[用户问题]
{question}
[参考答案]
{ref}
"""
    + base_prompt
)


math_prompt = (
    """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答的答案能和参考答案一致。
2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。
[用户问题]
{question}
[参考答案]
{ref}
"""
    + base_prompt
)

reason_prompt = math_prompt

creation_prompt = (
    """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
"""
    + base_prompt
)

sub_map = {
    'language': language_prompt,
    'knowledge': knowledge_prompt,
    'reason_v2': reason_prompt,
    'math_v2': math_prompt,
    'creationv2_zh': creation_prompt,
}

gpt4 = [
    dict(
        abbr='gpt4-turbo',
    )
]

for _name, _prompt in sub_map.items():
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt='{question}'),
                ]
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt=_prompt),
                    ]
                ),
            ),
            dict_postprocessor=dict(
                type=compassarena_bradleyterry_postprocess,
                count_ties=True,
            ),
            keep_predictions=True,  # Must be turned on to save predictions from model pairs to calculate style features in postprocessor
        ),
        pred_role='BOT',
    )

    compassarena_datasets.append(
        dict(
            abbr=f'compassarena_{_name}',
            type=CompassArenaDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='m2n',
            infer_order='random',
            base_models=gpt4,
            # summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
            given_pred=[
                {
                    'abbr': 'gpt4-turbo',
                    'path': './data/subjective/compass_arena/gpt4-turbo',
                }
            ],
        )
    )
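
The CompassArena judge prompts above ask the judge to answer with 选择:A (answer 1 is better), 选择:B (answer 2 is better) or 选择:C (tie), followed by a reason. As a rough illustration of the kind of parsing a postprocessor with count_ties=True has to perform (not a description of compassarena_bradleyterry_postprocess itself), the sketch below extracts the verdict and records a tie as half a win for each side, one common way of feeding ties into a Bradley-Terry fit. The record layout and field names are hypothetical.

import re


def parse_verdict(judge_text):
    """Extract the judge's choice (A / B / C) from text like '选择:A 原因:...'."""
    match = re.search(r'选择\s*[::]\s*([ABC])', judge_text)
    return match.group(1) if match else None


def to_pairwise_records(rows, count_ties=True):
    """Turn judge outputs into (model_1, model_2, score_for_model_1) records.

    A score of 1.0 means model_1 won, 0.0 means model_2 won, and 0.5 records
    a tie as half a win for each side before fitting Bradley-Terry ratings.
    """
    records = []
    for row in rows:
        choice = parse_verdict(row['judge_output'])
        if choice == 'A':
            records.append((row['answer1_model'], row['answer2_model'], 1.0))
        elif choice == 'B':
            records.append((row['answer1_model'], row['answer2_model'], 0.0))
        elif choice == 'C' and count_ties:
            records.append((row['answer1_model'], row['answer2_model'], 0.5))
        # Verdicts that cannot be parsed are simply dropped.
    return records


# Hypothetical judge outputs for one candidate model compared against gpt4-turbo.
rows = [
    {'answer1_model': 'my-model', 'answer2_model': 'gpt4-turbo',
     'judge_output': '选择:A\n原因:回答1更贴近参考答案。'},
    {'answer1_model': 'my-model', 'answer2_model': 'gpt4-turbo',
     'judge_output': '选择:C\n原因:两个回答质量相当。'},
]
print(to_pairwise_records(rows))
# [('my-model', 'gpt4-turbo', 1.0), ('my-model', 'gpt4-turbo', 0.5)]
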
configs/datasets/subjective/wildbench/wildbench_pair_judge_bradleyterry.py (74 additions, 0 deletions)
@@ -0,0 +1,74 @@
from opencompass.datasets import WildBenchDataset, wildbench_bradleyterry_postprocess
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.summarizers import WildBenchPairSummarizer

subjective_reader_cfg = dict(
    input_columns=['dialogue', 'prompt'],
    output_column='judge',
)


data_path = './data/subjective/WildBench/wildbench.jsonl'

wildbench_datasets = []
subjective_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template="""{dialogue}"""),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=ChatInferencer, max_seq_len=32768, infer_mode='last'),
)

subjective_eval_cfg = dict(
    evaluator=dict(
        type=LMEvaluator,
        prompt_template=dict(type=PromptTemplate, template="""{prompt}"""),
        dict_postprocessor=dict(type=wildbench_bradleyterry_postprocess),
        keep_predictions=True,  # Must be turned on to save predictions from model pairs to calculate style features in postprocessor
    ),
    pred_role='BOT',
)

base_models = [
    dict(
        abbr='gpt4-turbo',
    ),
    dict(
        abbr='HaiKu',
    ),
    dict(
        abbr='llama-2-70b-chat-hf',
    ),
]

wildbench_datasets.append(
    dict(
        abbr='wildbench',
        type=WildBenchDataset,
        path=data_path,
        eval_mode='pair',
        reader_cfg=subjective_reader_cfg,
        infer_cfg=subjective_infer_cfg,
        eval_cfg=subjective_eval_cfg,
        given_pred=[
            {'abbr': 'gpt4-turbo', 'path': './data/subjective/WildBench/gpt4'},
            {
                'abbr': 'llama-2-70b-chat-hf',
                'path': './data/subjective/WildBench/llama2-70b',
            },
            {'abbr': 'HaiKu', 'path': './data/subjective/WildBench/claude'},
            {
                'abbr': 'llama-2-70b-chat-turbomind',
                'path': './data/subjective/WildBench/llama2-70b',
            },
            {
                'abbr': 'llama-2-70b-chat-vllm',
                'path': './data/subjective/WildBench/llama2-70b',
            },
        ],
        mode='m2n',  # m models are judged against n base models
        infer_order='random',
        base_models=base_models,
    )
)
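
All three configs use mode='m2n' with a list of base_models and cached given_pred outputs: each model under evaluation is judged against each base model rather than against every other candidate, and infer_order='random' shuffles which response the judge sees first to reduce position bias. The sketch below illustrates that pairing idea only; the real pairing logic lives inside OpenCompass, and the function and field names here are invented for the example.

import random


def build_m2n_pairs(candidate_models, base_models, questions, seed=0):
    """Pair every candidate with every base model on every question,
    randomizing which side the judge sees first (cf. infer_order='random')."""
    rng = random.Random(seed)
    pairs = []
    for question in questions:
        for cand in candidate_models:
            for base in base_models:
                first, second = (cand, base) if rng.random() < 0.5 else (base, cand)
                pairs.append({'question': question,
                              'model_1': first,
                              'model_2': second})
    return pairs


pairs = build_m2n_pairs(
    candidate_models=['my-chat-7b', 'my-chat-72b'],
    base_models=['gpt4-turbo', 'HaiKu', 'llama-2-70b-chat-hf'],
    questions=['q1', 'q2'],
)
print(len(pairs))  # 2 questions x 2 candidates x 3 base models = 12 judged pairs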