-
Notifications
You must be signed in to change notification settings - Fork 4
/
handler.py
75 lines (65 loc) · 2.8 KB
/
handler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
def handler(event, context):
    """
    Download each scanned tar.gz named in an S3 event, OCR it into a PDF and
    upload the PDF to the configured cloud service (AWS S3 or Google Drive).

    Configuration is read from environment variables:

    * ``UPLOAD_TYPE``: ``'s3'``, ``'gdrive'`` or ``'discard'`` (required).
    * ``S3_BUCKET``: destination bucket, required when uploading to S3.
    * ``GDRIVE_CLIENT_ID``, ``GDRIVE_CLIENT_SECRET``, ``GDRIVE_REFRESH_TOKEN``:
      required when uploading to Google Drive; ``GDRIVE_FOLDER`` is optional.
    * ``EMPTY_PAGE_THRESHOLD`` (default 200) and ``TESSERACT_LANG``
      (default ``'eng'``) are passed through to ``ocr.ocr``.

    For ``'s3'`` and ``'gdrive'`` the source object is deleted from its bucket
    after a successful upload. For ``'discard'`` the source object is kept.
    The locally generated PDF is always removed.

    :param event: dict with a ``'Records'`` list; each record provides
        ``record['s3']['bucket']['name']`` and ``record['s3']['object']['key']``.
    :param context: unused.
    :raises AssertionError: if a variable required by the selected upload type
        is missing from the environment.
    :raises Exception: if ``UPLOAD_TYPE`` is unset or not a supported value.
    """
    upload_type = os.environ.get('UPLOAD_TYPE', None)

    # Validate the configuration up front so a misconfigured function fails
    # before any download/OCR work is done. The checks are raised explicitly
    # (rather than via ``assert``) so they are not stripped under ``python -O``;
    # AssertionError is kept as the type for backward compatibility.
    if upload_type == 'gdrive':
        for k in ['GDRIVE_CLIENT_ID', 'GDRIVE_CLIENT_SECRET', 'GDRIVE_REFRESH_TOKEN']:
            if k not in os.environ:
                raise AssertionError("missing {} in environment vars".format(k))
    elif upload_type == 's3':
        if 'S3_BUCKET' not in os.environ:
            raise AssertionError("missing S3_BUCKET in environment vars")
    elif upload_type != 'discard':
        # Format the value already read: indexing os.environ here would raise
        # a bare KeyError when UPLOAD_TYPE is not set at all.
        raise Exception('unknown upload type {}'.format(upload_type))

    empty_page_threshold = int(os.environ.get('EMPTY_PAGE_THRESHOLD', 200))
    language = os.environ.get('TESSERACT_LANG', 'eng')

    # Imported lazily, only once the configuration is known to be valid.
    from urllib.parse import unquote_plus
    import boto3
    import ocr

    s3 = boto3.client('s3')
    for record in event['Records']:
        src_bucket = record['s3']['bucket']['name']
        # S3 event notifications deliver the object key URL-encoded (e.g. a
        # space arrives as '+'); decode it before using it with the S3 API.
        src_file = unquote_plus(record['s3']['object']['key'])

        s3.download_file(src_bucket, src_file, "/tmp/scan.tar.gz")
        pdf_file = ocr.ocr("/tmp/scan.tar.gz", empty_page_threshold, language)
        # NOTE(review): everything after the FIRST dot of the key is dropped,
        # so dots inside a prefix or base name also truncate the result.
        dest_filename = src_file.split('.')[0] + '.pdf'

        try:
            if upload_type == 's3':
                bucket = os.environ['S3_BUCKET']
                s3.upload_file(pdf_file, bucket, dest_filename)
            elif upload_type == 'gdrive':
                folder = os.environ.get('GDRIVE_FOLDER', None)
                client_id = os.environ['GDRIVE_CLIENT_ID']
                client_secret = os.environ['GDRIVE_CLIENT_SECRET']
                refresh_token = os.environ['GDRIVE_REFRESH_TOKEN']
                upload_gdrive(pdf_file, dest_filename, client_id, client_secret, refresh_token, folder)
            else:
                # 'discard': keep the source object and move on to the next
                # record instead of abandoning the rest of the event.
                print('all fine, discarding file, but not deleting source file')
                continue
            # Only reached after a successful upload.
            s3.delete_object(Bucket=src_bucket, Key=src_file)
        finally:
            # Always drop the local PDF, even when the upload failed or the
            # result is discarded, so it does not pile up in /tmp.
            os.remove(pdf_file)
def upload_gdrive(file_src, file_dest, client_id, client_secret, refresh_token, folder=None):
    """
    Create a file on Google Drive (API v3) from a local PDF.

    :param file_src: path of the local file to upload; sent with mimetype
        ``application/pdf``.
    :param file_dest: name given to the file created on Drive.
    :param client_id: OAuth client id handed to ``GoogleCredentials``.
    :param client_secret: OAuth client secret handed to ``GoogleCredentials``.
    :param refresh_token: OAuth refresh token handed to ``GoogleCredentials``.
    :param folder: optional value placed as the single entry of the new
        file's ``parents`` list; omitted from the metadata when ``None``.
    :return: ``None``; the response of the create call is not used.
    """
    import httplib2
    from apiclient import discovery
    from oauth2client import client
    from apiclient.http import MediaFileUpload

    # Positional arguments follow the GoogleCredentials constructor; the two
    # ``None`` values presumably fill the access-token and token-expiry slots
    # (confirm against the oauth2client reference).
    creds = client.GoogleCredentials(
        None,
        client_id,
        client_secret,
        refresh_token,
        None,
        "https://accounts.google.com/o/oauth2/token",
        'scanner-ocr',
    )
    authorized_http = creds.authorize(httplib2.Http())
    drive = discovery.build('drive', 'v3', http=authorized_http, cache_discovery=False)

    # Only attach a parent when the caller asked for a specific folder.
    metadata = {'name': file_dest}
    if folder is not None:
        metadata['parents'] = [folder]

    pdf_media = MediaFileUpload(file_src, mimetype='application/pdf')
    create_request = drive.files().create(body=metadata, media_body=pdf_media, fields='id')
    create_request.execute()