-
Notifications
You must be signed in to change notification settings - Fork 1
/
New.py
125 lines (101 loc) · 4.25 KB
/
New.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from os.path import splitext
from typing import List, Sequence, Tuple
import pandas as pd
from google.cloud import documentai
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "cac-smart-docs-5130258bf4be.json"
def online_process(
project_id: str,
location: str,
processor_id: str,
file_path: str,
mime_type: str,
) -> documentai.Document:
"""
Processes a document using the Document AI Online Processing API.
"""
opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}
# Instantiates a client
documentai_client = documentai.DocumentProcessorServiceClient(client_options=opts)
# The full resource name of the processor, e.g.:
# projects/project-id/locations/location/processor/processor-id
# You must create new processors in the Cloud Console first
resource_name = documentai_client.processor_path(project_id, location, processor_id)
# Read the file into memory
with open(file_path, "rb") as image:
image_content = image.read()
# Load Binary Data into Document AI RawDocument Object
raw_document = documentai.RawDocument(
content=image_content, mime_type=mime_type
)
# Configure the process request
request = documentai.ProcessRequest(
name=resource_name, raw_document=raw_document
)
# Use the Document AI client to process the sample form
result = documentai_client.process_document(request=request)
return result.document
def get_table_data(
rows: Sequence[documentai.Document.Page.Table.TableRow], text: str
) -> Tuple[List[List[str]], List[List[float]]]:
"""
Get text data and confidence scores from table rows.
"""
all_values: List[List[str]] = []
all_confidences: List[List[float]] = []
for row in rows:
current_row_values: List[str] = []
current_row_confidences: List[float] = []
for cell in row.cells:
current_row_values.append(text_anchor_to_text(cell.layout.text_anchor, text))
current_row_confidences.append(cell.layout.confidence)
all_values.append(current_row_values)
all_confidences.append(current_row_confidences)
return all_values, all_confidences
def text_anchor_to_text(text_anchor: documentai.Document.TextAnchor, text: str) -> str:
"""
Document AI identifies table data by their offsets in the entirety of the
document's text. This function converts offsets to a string.
"""
response = ""
# If a text segment spans several lines, it will
# be stored in different text segments.
for segment in text_anchor.text_segments:
start_index = int(segment.start_index)
end_index = int(segment.end_index)
response += text[start_index:end_index]
return response.strip().replace("\n", " ")
PROJECT_ID = "cac-smart-docs"
LOCATION = "us" # Format is 'us' or 'eu'
PROCESSOR_ID = "373b6eea27dcab8" # Create processor before running sample
# The local file in your current working directory
FILE_PATH = "./New Testing/0.7Black Pen.pdf"
# Refer to https://cloud.google.com/document-ai/docs/file-types
# for supported file types
MIME_TYPE = "application/pdf"
document = online_process(
project_id=PROJECT_ID,
location=LOCATION,
processor_id=PROCESSOR_ID,
file_path=FILE_PATH,
mime_type=MIME_TYPE,
)
header_row_values: List[List[str]] = []
header_row_confidences: List[List[float]] = []
body_row_values: List[List[str]] = []
body_row_confidences: List[List[float]] = []
# Input Filename without extension
output_file_prefix = splitext(FILE_PATH)[0]
for page in document.pages:
for index, table in enumerate(page.tables):
header_row_values, header_row_confidences = get_table_data(table.header_rows, document.text)
body_row_values, body_row_confidences = get_table_data(table.body_rows, document.text)
# Create a Pandas DataFrame to print the values in tabular format
df = pd.DataFrame(
data=body_row_values,
columns=pd.MultiIndex.from_arrays(header_row_values),
)
# Create a Pandas DataFrame for the confidence scores
# Save each table as a CSV file
output_filename = f"{output_file_prefix}_pg{page.page_number}_tb{index}.csv"
df.to_csv(output_filename, index=False)