forked from Kororinpas/Lit_Tool
-
Notifications
You must be signed in to change notification settings - Fork 0
/
doi.py
35 lines (32 loc) · 1.69 KB
/
doi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def get_doi(abstract):
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0) # type: ignore
schema = Object(
id="doi",
description="doi is a digital identifier.It typically starts with 10. followed by a numeric prefix, such as 10.1000/182.",
attributes=[
Text(
id="doi",
description='doi is a digital identifier. It typically starts with "10." followed by a numeric prefix, such as 10.1000/182.',
examples=[
('American Economic Journal: Economic Policy 2015, 7(4): 223–242 http://dx.doi.org/10.1257/pol.20130367 223 Water Pollution Progress at Borders: The','http://dx.doi.org/10.1257/pol.20130367'),
('Environment and Development Economics (2020), 1–17 doi:10.1017/S1355770X2000025X EDE RESEARCH ARTICLE Political incentives, Party Congress, and pollution cycle: empirical evidence from China Zhihua Tian,1 and Yanfang Tian2* 1School of Economics, Zhejiang University of Technology, Hangzhou','10.1017/S1355770X2000025X')
],
many=True
)
],
many=False
)
chain = create_extraction_chain(llm, schema, encoder_or_encoder_class='json')
output = chain.predict_and_parse(text=abstract.page_content)
if 'doi' not in output['data']:
print(f"LLM strategy failed!!{abstract.metadata['source']} Please manually add it!!")
source = 'None'
return source
else:
doi = output['data']['doi']['doi'][0]
if 'doi=' in doi:
doi = doi.split('doi=')[1]
return doi