-
Notifications
You must be signed in to change notification settings - Fork 2
/
pdf_documents.py
32 lines (25 loc) · 1001 Bytes
/
pdf_documents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from pdf_metadata import get_pdf_metadata
from pdf_metadata_llm import get_pdf_metadata_using_llm
def get_pdf_documents(pdf_files):
    """Load each PDF in *pdf_files* and return all loaded elements in one list.

    For every path, metadata is first requested from ``get_pdf_metadata``.
    If that yields the string ``'None'``, the loaded document is passed to
    ``get_pdf_metadata_using_llm`` as a fallback. Whenever a metadata value
    other than ``'None'`` is available, it replaces the ``metadata`` of every
    element loaded from that file; otherwise the loader's own metadata is
    left in place. Each element's ``page_content`` is stripped and has runs
    of newlines collapsed to a single space.

    Args:
        pdf_files: Iterable of PDF file paths accepted by ``PyMuPDFLoader``.

    Returns:
        A flat list of the elements returned by ``PyMuPDFLoader(...).load()``
        for all files, in input order.
    """
    # Imports are kept function-local, as in the original, so that importing
    # this module does not require langchain to be installed.
    import re

    from langchain.document_loaders import PyMuPDFLoader

    # Compile once per call instead of re-resolving the pattern per element.
    newline_runs = re.compile(r'\n+')

    docs = []
    for pdf_fullpath in pdf_files:
        metadata = get_pdf_metadata(pdf_fullpath)
        doc = PyMuPDFLoader(pdf_fullpath).load()

        # NOTE(review): the helpers are compared against the *string* 'None',
        # not the None object -- presumably that is their "not found"
        # sentinel; confirm against pdf_metadata / pdf_metadata_llm.
        if metadata == 'None':
            print(f"{pdf_fullpath} is not identified! Using other strategy!!")
            metadata = get_pdf_metadata_using_llm(doc)

        has_metadata = metadata != 'None'
        for element in doc:
            if has_metadata:
                # Every element of this file receives the same metadata
                # object (no per-element copy), matching prior behavior.
                element.metadata = metadata
            element.page_content = newline_runs.sub(
                ' ', element.page_content.strip()
            )
            docs.append(element)
    return docs