# parse_table.py
#
# Description:
# This script is an object-oriented approach to table extraction from PDFs.
# It relies heavily on the Document and Page objects defined in parser.py.
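#
# Usage (a sketch, assuming parser.py sits alongside this script):
#   python parse_table.py
# With the default paths set in the __main__ block below, PDFs are read from
# 01_data/, extracted CSVs are written to 02_output/, and any parsing failures
# are collected into 02_output/errors.csv.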
import os
import shutil
import logging
import datetime
import itertools
import numpy as np
import pandas as pd
import multiprocessing as mp
from functools import partial
from parser import Document, Page

# Module-level logger so main() can log even when this file is imported;
# handlers are attached in the __main__ guard at the bottom.
logger = logging.getLogger('parse_table')

def clear_contents(dir_path):
    '''
    Deletes the contents of the given directory. Useful for testing runs.
    '''
    for f in os.listdir(dir_path):
        full_path = os.path.join(dir_path, f)
        if os.path.isdir(full_path):
            shutil.rmtree(full_path)
        else:
            os.remove(full_path)
    return None

def parse_document(data_dir, output_dir, doc_path):
    '''
    Parses a single PDF. Defined at module level (rather than inline) so it
    can be pickled and dispatched by multiprocessing.Pool.
    Returns a list of per-page error records if parsing fails, else None.
    '''
    pdf_doc = Document(doc_path, data_dir, output_dir)
    return pdf_doc.parse_doc()
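
# A minimal sketch of the parser.py interface this script assumes
# (a hypothetical stand-in to illustrate the contract, not the real
# implementation):
#
#     class Document:
#         def __init__(self, filename, data_dir, output_dir):
#             self.filename = filename
#             ...
#         def parse_doc(self):
#             # Writes extracted CSVs into output_dir. Returns None on
#             # success, or a list of error records such as
#             # [{'document': self.filename, 'page': 3, 'error': '...'}]
#             ...
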
def main(data_dir, output_dir):
    '''
    Main control flow:
    1. Checks that the data folder exists and creates the output folder if needed.
    2. Parses each PDF file in data_dir in parallel via parse_document().
    3. Output CSVs are written to output_dir.
    '''
    # Check that the organizing folders exist; the data folder must already
    # be populated, while the output folder can be created on the fly
    if not os.path.exists(data_dir):
        raise FileNotFoundError("Data folder is missing or not assigned.")
    os.makedirs(output_dir, exist_ok=True)
    # Clear output folder (only after it is guaranteed to exist)
    clear_contents(output_dir)
    # Get list of pdfs to parse
    pdf_list = [f for f in os.listdir(data_dir) if f.endswith(".pdf")]
    logger.info(f"{len(pdf_list)} file(s) detected.")
    # Initialize pool and parallelize processing; partial() freezes the two
    # directory arguments so pool.map() only has to supply each filename
    num_processes = mp.cpu_count()
    with mp.Pool(processes=num_processes) as pool:
        func = partial(parse_document, data_dir, output_dir)
        results = pool.map(func, pdf_list)
    errors = list(filter(None, results))
    # Export errors to CSV if they exist
    if len(errors) > 0:
        error_df = pd.DataFrame(list(itertools.chain(*errors)))
        ERROR_PATH = os.path.join(output_dir, 'errors.csv')
        logger.info(f"Completed with errors in {len(errors)} file(s). Exporting to {ERROR_PATH}")
        error_df.to_csv(ERROR_PATH, index=False, columns=['document', 'page', 'error'])
    else:
        logger.info("Completed with no errors detected.")
    # Non-parallel version:
    # Loop over PDF files, create Document objects, call Document.parse_doc()
    # for i in sorted(pdf_list):
    #     logger.info(f"Parsing file: {os.path.join(data_dir, i)}")
    #     pdf_doc = Document(i, data_dir, output_dir)
    #     pdf_doc.parse_doc()
    return None

if __name__ == "__main__":
    # Key paths and parameters
    DATA_DIR = "01_data"
    OUTPUT_DIR = "02_output"
    # Initialize logging: start from a fresh log file on each run
    if os.path.exists('parse_table.log'):
        os.remove('parse_table.log')
    logger.setLevel(logging.INFO)
    ch = logging.StreamHandler()
    fh = logging.FileHandler('parse_table.log')
    logger.addHandler(ch)
    logger.addHandler(fh)
    # Run main control flow
    start = datetime.datetime.now()
    main(DATA_DIR, OUTPUT_DIR)
    duration = datetime.datetime.now() - start
    print(f"Time taken: {duration}")