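'''Ask questions about a Notion export, using Pinecone as the vector store.

Pipeline: read the exported markdown files, split them into chunks, embed
each chunk with OpenAI, upsert the vectors into a Pinecone index, then
answer a question by retrieving the closest chunks and passing them to
the LLM.

Usage: python main.py <notion_export_dir> "<question>"
'''
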
import os # to get system environment variables
import sys # to parse system arguments
import json # to dump and load data (lists) elegantly
import pinecone # vector database service, core of the VDB query
from pathlib import Path # used to manipulate file paths elegantly
from tqdm.auto import tqdm # to show progress bar
import time # to avoid RateLimitError
from langchain.text_splitter import CharacterTextSplitter # to split texts
from langchain.prompts import PromptTemplate # makes query easier
from langchain.llms import OpenAI # to query LLMs
from langchain.chains import LLMChain # makes query easier
from langchain.embeddings import OpenAIEmbeddings # to turn texts into vectors
def pinecone_init(index_name: str = 'notion-database'):
    '''Initialize the connection to Pinecone (get an API key at app.pinecone.io).'''
    pinecone.init(
        api_key=os.environ["PINECONE_API_KEY"],
        environment="us-east1-gcp"
    )
    # check if the index already exists (it shouldn't on a first run)
    if index_name not in pinecone.list_indexes():
        # if it does not exist, create it
        pinecone.create_index(
            index_name,
            dimension=1536,  # dimension of OpenAI's default embedding model
            metric='cosine',
            # metric='euclidean',
            # metadata_config is currently unused, since no metadata with these
            # fields is ever attached; kept around for a future metadata pass
            metadata_config={'indexed': ['channel_id', 'published']}
        )
    # connect to the index
    index = pinecone.Index(index_name)
    # view index status with index.describe_index_stats()
    return index

# an earlier error here went away after retrying and upgrading Jupyter Notebook
# with `pip install notebook --upgrade`
def md_digest(ps=None):
    '''Ingest a Notion markdown export and split it into chunks for LangChain.'''
    # default to the standard Notion export folder; resolved at call time
    # rather than as a default argument, so it is not frozen at import
    if ps is None:
        ps = list(Path("Notion_DB/").glob("**/*.md"))
    # load the data in the format that Notion exports it in
    data = []
    sources = []
    for p in ps:
        with open(p) as f:
            data.append(f.read())
        sources.append(p)
    # split each file on newlines into chunks of at most 1000 characters,
    # to stay within the context limits of the LLMs
    text_splitter = CharacterTextSplitter(chunk_size=1000, separator="\n")
    docs = []
    metadatas = []
    for i, d in enumerate(data):
        # i, d are the index and content of each .md file respectively
        splits = text_splitter.split_text(d)
        docs.extend(splits)
        # note: metadatas is built but never uploaded anywhere yet
        metadatas.extend([{"source": sources[i]}] * len(splits))
    # after digestion, save the chunks to a local JSON file so later
    # queries can reuse them without re-encoding
    with open('docs.json', 'w') as f:
        json.dump(docs, f)
    return docs

# open question: will a 1000-character chunk be too coarse/unspecific for
# precise retrieval? (see the note below)
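# one possible mitigation (not used here): CharacterTextSplitter also takes a
# chunk_overlap parameter, e.g. CharacterTextSplitter(chunk_size=1000,
# chunk_overlap=100, separator="\n"), so neighboring chunks share some context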
# len(docs) is the number of vectors that will be created
def pinecone_upload(docs=None, index=None):
    '''Embed every chunk with OpenAI and upload the vectors to Pinecone.'''
    # resolve defaults at call time so importing this module has no side effects
    if docs is None:
        docs = md_digest()
    if index is None:
        index = pinecone_init()
    id_batch = [str(x) for x in range(len(docs))]
    embeddings = OpenAIEmbeddings()  # create the client once, not per chunk
    coord_list = []
    for i in tqdm(range(len(docs))):
        # pause every 60 requests to avoid RateLimitError; 60 seconds is an
        # arbitrary but conservative number (a crude approach, admittedly,
        # since I'm not a paid user)
        rest = 60
        if i != 0 and i % 60 == 0:
            print(f"let's wait for {rest} seconds to avoid RateLimitError...")
            for _ in tqdm(range(rest)):  # don't shadow the outer loop index i
                time.sleep(1)
        # embed the chunk
        coord = embeddings.embed_query(docs[i])
        coord_list.append(coord)
    # pair ids with vectors and upload them to Pinecone
    vectors = list(zip(id_batch, coord_list))
    index.upsert(vectors)
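
# note (untested sketch): a single upsert can hit Pinecone's request-size limit
# on large corpora; the commonly recommended pattern is to upsert in batches,
# e.g. for j in range(0, len(vectors), 100): index.upsert(vectors[j:j+100])
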
def pinecone_query(query: str = "who are you", docs=None, index=None):
    '''Embed the query, fetch the closest chunks from Pinecone, and return their texts.'''
    if docs is None:
        docs = md_digest()
    if index is None:
        index = pinecone_init()
    query_coord = OpenAIEmbeddings().embed_query(query)
    # retrieve the 3 most relevant contexts from Pinecone
    query_res = index.query(query_coord, top_k=3, include_metadata=True)
    # the stored ids are positions in the local docs list, so map them back to texts
    content_ids = [int(x['id']) for x in query_res['matches']]
    contents = [docs[i] for i in content_ids]
    contents_str = "\n\n".join(contents)
    return contents_str
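
# design note: an alternative would be to store each chunk's text as Pinecone
# metadata at upsert time, so queries don't depend on the local docs list; as
# written, the ids are simply positions into docs, which must match the upload run
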
def ask_gpt3(query: str = "who are you", contents_str=None):
    '''Answer the query with the LLM, using the retrieved contents as context.'''
    if contents_str is None:
        contents_str = pinecone_query(query)
    prompt = PromptTemplate(
        input_variables=["question", "contents"],
        template='''Answer this question: "{question}" using the contents below
Contents:
{contents}
Answer:
''',
    )
    chain = LLMChain(
        llm=OpenAI(temperature=0),
        prompt=prompt,
        # verbose=True,
    )
    answer = chain.run(question=query, contents=contents_str)
    return answer

def ans_cont_to_file(answer, contents_str):
    '''Write the answer and the retrieved contents to text files.'''
    with open("answer.txt", "w") as f:
        f.write(answer)
    with open("contents.txt", "w") as f:
        f.write(contents_str)

def main():
    print("initiating pinecone index...")
    index = pinecone_init("notion-database")
    directory, query = sys.argv[1], sys.argv[2]
    print("digesting docs...")
    docs = md_digest(list(Path(directory).glob("**/*.md")))
    print("uploading data to pinecone...")
    pinecone_upload(docs, index)
    print("querying pinecone...")
    contents_str = pinecone_query(query, docs, index)
    print("querying gpt...")
    answer = ask_gpt3(query=query, contents_str=contents_str)
    # optionally persist the answer and contents as text files
    print("writing results to answer.txt and contents.txt")
    ans_cont_to_file(answer, contents_str)
    print(f"done! the answer to '{query}' is: '{answer}'")

if __name__ == "__main__":
main()