forked from stanford-oval/storm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_storm_wiki_gpt_with_VectorRM.py
187 lines (168 loc) · 10.1 KB
/
run_storm_wiki_gpt_with_VectorRM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
"""
This STORM Wiki pipeline powered by GPT-3.5/4 and local retrieval model that uses Qdrant.
You need to set up the following environment variables to run this script:
- OPENAI_API_KEY: OpenAI API key
- OPENAI_API_TYPE: OpenAI API type (e.g., 'openai' or 'azure')
- QDRANT_API_KEY: Qdrant API key (needed ONLY if online vector store was used)
You will also need an existing Qdrant vector store either saved in a folder locally offline or in a server online.
If not, then you would need a CSV file with documents, and the script is going to create the vector store for you.
The CSV should be in the following format:
content | title | url | description
I am a document. | Document 1 | docu-n-112 | A self-explanatory document.
I am another document. | Document 2 | docu-l-13 | Another self-explanatory document.
Notice that the URL will be a unique identifier for the document so ensure different documents have different urls.
Output will be structured as below
args.output_dir/
topic_name/ # topic_name will follow convention of underscore-connected topic name w/o space and slash
conversation_log.json # Log of information-seeking conversation
raw_search_results.json # Raw search results from search engine
direct_gen_outline.txt # Outline directly generated with LLM's parametric knowledge
storm_gen_outline.txt # Outline refined with collected information
url_to_info.json # Sources that are used in the final article
storm_gen_article.txt # Final article generated
storm_gen_article_polished.txt # Polished final article (if args.do_polish_article is True)
"""
import os
import sys
from argparse import ArgumentParser
sys.path.append('./')
from knowledge_storm import STORMWikiRunnerArguments, STORMWikiRunner, STORMWikiLMConfigs
from knowledge_storm.rm import VectorRM
from knowledge_storm.lm import OpenAIModel, AzureOpenAIModel
from knowledge_storm.utils import load_api_key, QdrantVectorStoreManager
def main(args):
# Load API key from the specified toml file path
load_api_key(toml_file_path='secrets.toml')
# Initialize the language model configurations
engine_lm_configs = STORMWikiLMConfigs()
openai_kwargs = {
'api_key': os.getenv("OPENAI_API_KEY"),
'temperature': 1.0,
'top_p': 0.9,
}
ModelClass = OpenAIModel if os.getenv('OPENAI_API_TYPE') == 'openai' else AzureOpenAIModel
# If you are using Azure service, make sure the model name matches your own deployed model name.
# The default name here is only used for demonstration and may not match your case.
gpt_35_model_name = 'gpt-3.5-turbo' if os.getenv('OPENAI_API_TYPE') == 'openai' else 'gpt-35-turbo'
gpt_4_model_name = 'gpt-4o'
if os.getenv('OPENAI_API_TYPE') == 'azure':
openai_kwargs['api_base'] = os.getenv('AZURE_API_BASE')
openai_kwargs['api_version'] = os.getenv('AZURE_API_VERSION')
# STORM is a LM system so different components can be powered by different models.
# For a good balance between cost and quality, you can choose a cheaper/faster model for conv_simulator_lm
# which is used to split queries, synthesize answers in the conversation. We recommend using stronger models
# for outline_gen_lm which is responsible for organizing the collected information, and article_gen_lm
# which is responsible for generating sections with citations.
conv_simulator_lm = ModelClass(model=gpt_35_model_name, max_tokens=500, **openai_kwargs)
question_asker_lm = ModelClass(model=gpt_35_model_name, max_tokens=500, **openai_kwargs)
outline_gen_lm = ModelClass(model=gpt_4_model_name, max_tokens=400, **openai_kwargs)
article_gen_lm = ModelClass(model=gpt_4_model_name, max_tokens=700, **openai_kwargs)
article_polish_lm = ModelClass(model=gpt_4_model_name, max_tokens=4000, **openai_kwargs)
engine_lm_configs.set_conv_simulator_lm(conv_simulator_lm)
engine_lm_configs.set_question_asker_lm(question_asker_lm)
engine_lm_configs.set_outline_gen_lm(outline_gen_lm)
engine_lm_configs.set_article_gen_lm(article_gen_lm)
engine_lm_configs.set_article_polish_lm(article_polish_lm)
# Initialize the engine arguments
engine_args = STORMWikiRunnerArguments(
output_dir=args.output_dir,
max_conv_turn=args.max_conv_turn,
max_perspective=args.max_perspective,
search_top_k=args.search_top_k,
max_thread_num=args.max_thread_num,
)
# Create / update the vector store with the documents in the csv file
if args.csv_file_path:
kwargs = {
'file_path': args.csv_file_path,
'content_column': 'content',
'title_column': 'title',
'url_column': 'url',
'desc_column': 'description',
'batch_size': args.embed_batch_size,
'vector_db_mode': args.vector_db_mode,
'collection_name': args.collection_name,
'embedding_model': args.embedding_model,
'device': args.device,
}
if args.vector_db_mode == 'offline':
QdrantVectorStoreManager.create_or_update_vector_store(
vector_store_path=args.offline_vector_db_dir,
**kwargs
)
elif args.vector_db_mode == 'online':
QdrantVectorStoreManager.create_or_update_vector_store(
url=args.online_vector_db_url,
api_key=os.getenv('QDRANT_API_KEY'),
**kwargs
)
# Setup VectorRM to retrieve information from your own data
rm = VectorRM(collection_name=args.collection_name, embedding_model=args.embedding_model, device=args.device, k=engine_args.search_top_k)
# initialize the vector store, either online (store the db on Qdrant server) or offline (store the db locally):
if args.vector_db_mode == 'offline':
rm.init_offline_vector_db(vector_store_path=args.offline_vector_db_dir)
elif args.vector_db_mode == 'online':
rm.init_online_vector_db(url=args.online_vector_db_url, api_key=os.getenv('QDRANT_API_KEY'))
# Initialize the STORM Wiki Runner
runner = STORMWikiRunner(engine_args, engine_lm_configs, rm)
# run the pipeline
topic = input('Topic: ')
runner.run(
topic=topic,
do_research=args.do_research,
do_generate_outline=args.do_generate_outline,
do_generate_article=args.do_generate_article,
do_polish_article=args.do_polish_article,
)
runner.post_run()
runner.summary()
if __name__ == "__main__":
parser = ArgumentParser()
# global arguments
parser.add_argument('--output-dir', type=str, default='./results/gpt_retrieval',
help='Directory to store the outputs.')
parser.add_argument('--max-thread-num', type=int, default=3,
help='Maximum number of threads to use. The information seeking part and the article generation'
'part can speed up by using multiple threads. Consider reducing it if keep getting '
'"Exceed rate limit" error when calling LM API.')
# provide local corpus and set up vector db
parser.add_argument('--collection-name', type=str, default="my_documents",
help='The collection name for vector store.')
parser.add_argument('--embedding_model', type=str, default="BAAI/bge-m3",
help='The collection name for vector store.')
parser.add_argument('--device', type=str, default="mps",
help='The device used to run the retrieval model (mps, cuda, cpu, etc).')
parser.add_argument('--vector-db-mode', type=str, choices=['offline', 'online'],
help='The mode of the Qdrant vector store (offline or online).')
parser.add_argument('--offline-vector-db-dir', type=str, default='./vector_store',
help='If use offline mode, please provide the directory to store the vector store.')
parser.add_argument('--online-vector-db-url', type=str,
help='If use online mode, please provide the url of the Qdrant server.')
parser.add_argument('--csv-file-path', type=str, default=None,
help='The path of the custom document corpus in CSV format. The CSV file should include '
'content, title, url, and description columns.')
parser.add_argument('--embed-batch-size', type=int, default=64,
help='Batch size for embedding the documents in the csv file.')
# stage of the pipeline
parser.add_argument('--do-research', action='store_true',
help='If True, simulate conversation to research the topic; otherwise, load the results.')
parser.add_argument('--do-generate-outline', action='store_true',
help='If True, generate an outline for the topic; otherwise, load the results.')
parser.add_argument('--do-generate-article', action='store_true',
help='If True, generate an article for the topic; otherwise, load the results.')
parser.add_argument('--do-polish-article', action='store_true',
help='If True, polish the article by adding a summarization section and (optionally) removing '
'duplicate content.')
# hyperparameters for the pre-writing stage
parser.add_argument('--max-conv-turn', type=int, default=3,
help='Maximum number of questions in conversational question asking.')
parser.add_argument('--max-perspective', type=int, default=3,
help='Maximum number of perspectives to consider in perspective-guided question asking.')
parser.add_argument('--search-top-k', type=int, default=3,
help='Top k search results to consider for each search query.')
# hyperparameters for the writing stage
parser.add_argument('--retrieve-top-k', type=int, default=3,
help='Top k collected references for each section title.')
parser.add_argument('--remove-duplicate', action='store_true',
help='If True, remove duplicate content from the article.')
main(parser.parse_args())