-
Notifications
You must be signed in to change notification settings - Fork 2
/
arxiv_utls.py
72 lines (59 loc) · 2.47 KB
/
arxiv_utls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import arxiv
from langchain import OpenAI, PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import get_gai_completion
def download_pdf_from_arxiv(arxiv_id: str, filename="downloaded-paper.pdf"):
    """Look up a paper on arXiv by its ID and download its PDF.

    Args:
        arxiv_id: Identifier passed to ``arxiv.Search`` via ``id_list``.
        filename: File name handed to the paper's ``download_pdf`` call.

    Returns:
        A ``(paper, dest_file)`` tuple: the first search result and whatever
        ``paper.download_pdf`` returned (presumably the path of the written
        file -- confirm against the ``arxiv`` package version in use).

    Raises:
        ValueError: If the search yields no result for ``arxiv_id``.
    """
    search = arxiv.Search(id_list=[arxiv_id])
    # Use a default so an empty result set surfaces as a clear error instead
    # of a bare StopIteration, which is easy to swallow or misread upstream.
    paper = next(search.results(), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for id {arxiv_id!r}")
    dest_file = paper.download_pdf(filename=filename)
    return paper, dest_file
def extract_text_from_arxiv_pdf(arxiv_id, chars_per_chunk, overlap_chars):
    """Download an arXiv paper and split its PDF content into chunks.

    Args:
        arxiv_id: Identifier forwarded to ``download_pdf_from_arxiv``.
        chars_per_chunk: ``chunk_size`` given to the text splitter; length is
            measured with ``len``.
        overlap_chars: ``chunk_overlap`` given to the text splitter.

    Returns:
        A ``(title, chunks)`` tuple: the paper's ``title`` attribute and the
        result of the PDF loader's ``load_and_split``.
    """
    paper, pdf_path = download_pdf_from_arxiv(arxiv_id)
    pdf_loader = PyPDFLoader(pdf_path)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chars_per_chunk,
        chunk_overlap=overlap_chars,
        length_function=len,
    )
    title = paper.title
    chunks = pdf_loader.load_and_split(text_splitter=splitter)
    return title, chunks
def summarize_arxiv_paper(text, style, style_items, language):
    """Build a summarization prompt for ``text`` and send it for completion.

    Args:
        text: Technical text to summarize; embedded between triple backticks.
        style: Summary format. ``"paragraph"`` and ``"bulletpoints"`` use
            ``style_items`` as a count, ``"sonnet"`` asks for a sonnet, and
            any other value falls back to a single sentence.
        style_items: Number of paragraphs or bullet points requested; ignored
            for the other styles.
        language: Language the summary should be written in.

    Returns:
        Whatever ``get_gai_completion`` returns for the assembled prompt.
    """
    if style == "paragraph":
        summary_style = f"{style_items} paragraph"
    elif style == "bulletpoints":
        summary_style = f"{style_items} bullet points"
    elif style == "sonnet":
        summary_style = "a sonnet style"
    else:
        summary_style = "a single sentence"

    # Assembled line by line so the prompt starts cleanly with a newline and
    # contains no stray quote characters ahead of the instruction.
    prompt = (
        "\n"
        "Summarize the technical text, delimited by triple\n"
        f"backticks, in {language} in {summary_style}:\n"
        f"```{text}```\n"
    )
    return get_gai_completion(prompt)
def summarize_arxiv_paper_lc(docs, style, style_items, language):
    """Summarize document chunks with a LangChain ``map_reduce`` chain.

    Args:
        docs: Documents passed to the chain as ``input_documents``.
        style: Summary format. ``"paragraph"`` and ``"bulletpoints"`` use
            ``style_items`` as a count, ``"sonnet"`` asks for a sonnet, and
            any other value falls back to a single sentence.
        style_items: Number of paragraphs or bullet points requested; ignored
            for the other styles.
        language: Language the summary should be written in.

    Returns:
        The ``"output_text"`` entry of the chain's output. The full output
        (including intermediate steps) is also printed to stdout.
    """
    # Start from the fallback and override it for the recognised styles.
    summary_style = "a single sentence"
    if style == "paragraph":
        summary_style = f"{style_items} paragraph"
    elif style == "bulletpoints":
        summary_style = f"{style_items} bullet points"
    elif style == "sonnet":
        summary_style = "a sonnet style"

    template_text = (
        "\n"
        "Summarize the technical text, delimited by triple\n"
        "=, in {language} in {summary_style}:\n"
        "==={text}===\n"
    )
    summary_prompt = PromptTemplate(
        template=template_text,
        input_variables=["text", "language", "summary_style"],
    )

    # The same prompt drives both the per-chunk (map) and final (combine) steps.
    llm = OpenAI(temperature=0)
    summarize_chain = load_summarize_chain(
        llm,
        chain_type="map_reduce",
        return_intermediate_steps=True,
        map_prompt=summary_prompt,
        combine_prompt=summary_prompt,
    )

    chain_inputs = {
        "input_documents": docs,
        "language": language,
        "summary_style": summary_style,
    }
    result = summarize_chain(chain_inputs, return_only_outputs=True)
    print(result)
    return result["output_text"]