from textwrap import dedent


def get_pairwise_prompt_template(dataset, use_instruction=None):
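    """Return a pairwise comparison prompt template for `dataset`.

    When `use_instruction` is truthy, the hard-coded evaluation question is
    replaced by an `{{ instruction }}` placeholder (the GSM8k branch has no
    instruction variant).
    """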
    if dataset in ("SummEval", "newsroom"):
        # SummEval and newsroom share the same coherence-comparison template.
        if use_instruction:
            prompt = dedent(
                """\
Source text: {{ input }}
Summary candidate A: {{ output_1 }}
Summary candidate B: {{ output_2 }}
Question: {{ instruction }}
Answer: """
            )
        else:
            prompt = dedent(
                """\
Source text: {{ input }}
Summary candidate A: {{ output_1 }}
Summary candidate B: {{ output_2 }}
Question: Evaluate and compare the coherence of the two summary candidates for the given source text. \
Which summary candidate has better coherence? \
If candidate A is better, please return 'A'. \
If candidate B is better, please return 'B'. \
You must return the choice only.
Answer: """
            )
    elif dataset == "TopicalChat":
        if use_instruction:
            prompt = dedent(
                """\
Dialog history:
{{ input }}
Response candidate A: {{ output_1 }}
Response candidate B: {{ output_2 }}
Question: {{ instruction }}
Answer: """
            )
        else:
            prompt = dedent(
                """\
Dialog history:
{{ input }}
Response candidate A: {{ output_1 }}
Response candidate B: {{ output_2 }}
Question: Which response is overall better for the given dialog history? \
Please consider aspects including naturalness, understandability, context consistency, and knowledge richness. \
If candidate A is better, please return 'A'. \
If candidate B is better, please return 'B'. \
You must return the choice only.
Answer: """
            )
    elif dataset == "GSM8k":
        prompt = dedent(
            """\
Math question: {{ input }}
Solution candidate A: {{ output_1 }}
Solution candidate B: {{ output_2 }}
Instruction: Compare the quality of the two solution candidates for the given math question. \
Which solution candidate is better explained and more logical? \
If candidate A is better, please return 'A'. \
If candidate B is better, please return 'B'. \
You must return the choice only, without explanation.
Answer: """
        )
    else:
        raise ValueError(f"Invalid dataset: {dataset}")
return prompt
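
# Usage sketch (an assumption, not confirmed by this file alone: the
# `{{ ... }}` placeholders look like Jinja2 syntax and are rendered as such):
#
#   from jinja2 import Template
#   template = get_pairwise_prompt_template("GSM8k")
#   prompt = Template(template).render(
#       input="What is 2 + 3?",
#       output_1="2 + 3 = 5, so the answer is 5.",
#       output_2="The answer is 6.",
#   )
#   # The judge model is expected to reply with a single letter, 'A' or 'B'.
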
def get_pointwise_prompt_template(dataset, with_input):
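    """Return a pointwise (1-10 score) prompt template.

    `dataset` is accepted for interface symmetry but currently unused;
    `with_input` controls whether the source input is shown to the judge.
    """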
if with_input:
prompt = dedent(
"""\
Evaluate the overall quality of the following output candidate for the given input.
Input: {{ input }}
Output candidate: {{ output }}
Question: How would you rate the overall quality of the output candidate? \
Please provide a score between 1 and 10. \
You must return the score only.
Answer: """
)
else:
prompt = dedent(
"""\
Evaluate the overall quality of the following output candidate.
Output candidate: {{ output }}
Question: How would you rate the overall quality of the output candidate? \
Please provide a score between 1 and 10. \
You must return the score only.
Answer: """
)
return prompt
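
# Usage sketch (same Jinja2 assumption as above); the pointwise template asks
# the judge for a 1-10 score instead of an A/B choice:
#
#   from jinja2 import Template
#   template = get_pointwise_prompt_template("SummEval", with_input=True)
#   prompt = Template(template).render(input="...", output="...")
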
def get_cot_compare_prompt_template(dataset):
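    """Return a chain-of-thought prompt asking the model to analyse and
    compare two candidates before concluding which one is better."""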
if dataset == "SummEval":
prompt = dedent(
"""\
Source text: {{ input }}
Summary candidate A: {{ output_1 }}
Summary candidate B: {{ output_2 }}
Instruction: Please briefly analyse and compare the coherence of the two summary candidates for the given source text, \
and then conclude which candidate is more coherent."""
)
    elif dataset == "TopicalChat":
        prompt = dedent(
            """\
Dialog history:
{{ input }}
Response candidate A: {{ output_1 }}
Response candidate B: {{ output_2 }}
Instruction: Please briefly analyse and compare the overall quality of the two response candidates for the given dialog history, \
considering aspects including naturalness, understandability, context consistency and knowledge richness, \
and then conclude which candidate is better overall."""
        )
    elif dataset == "GSM8k":
        prompt = dedent(
            """\
Math question: {{ input }}
Solution candidate A: {{ output_1 }}
Solution candidate B: {{ output_2 }}
Instruction: Analyse and compare the quality of the two solution candidates for the given math question. \
Please briefly discuss the strengths and weaknesses of both solution candidates and conclude which is more logical and correct."""
        )
    else:
        raise ValueError(f"Invalid dataset: {dataset}")
return prompt


def get_cot_eval_prompt_template():
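    """Return a prompt that extracts a final 'A'/'B'/'C' verdict from the
    analysis produced by `get_cot_compare_prompt_template`."""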
    prompt = dedent(
        """\
{{ cot_response }}
Based on the above analysis, which candidate is preferred?
If candidate A is preferred, please return 'A'. \
If candidate B is preferred, please return 'B'. \
If both candidates are equally preferred, please return 'C'. \
You must return the choice only.
Answer: """
    )
return prompt
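

# A minimal end-to-end sketch of the two-step chain-of-thought flow implied by
# the two functions above. Assumptions, not confirmed by this file: templates
# are rendered with Jinja2, and `query_model` is a hypothetical stand-in for
# whatever LLM client the surrounding project actually uses.
if __name__ == "__main__":
    from jinja2 import Template

    def query_model(prompt: str) -> str:
        # Hypothetical placeholder; swap in the project's real LLM call.
        raise NotImplementedError

    example = {
        "input": "What is 2 + 3?",
        "output_1": "2 + 3 = 5, so the answer is 5.",
        "output_2": "The answer is 6.",
    }

    # Step 1: elicit a brief comparative analysis.
    compare_prompt = Template(get_cot_compare_prompt_template("GSM8k")).render(**example)
    analysis = query_model(compare_prompt)

    # Step 2: feed the analysis back and extract a single-letter verdict.
    # (Whether `cot_response` should also include the step-1 prompt is a
    # project-level detail this file does not specify.)
    eval_prompt = Template(get_cot_eval_prompt_template()).render(cot_response=analysis)
    verdict = query_model(eval_prompt)  # expected: 'A', 'B', or 'C'
    print(verdict)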