forked from Py-Contributors/awesomeScripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_redaction.py
57 lines (51 loc) · 1.86 KB
/
pdf_redaction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# imports
import fitz
import re
class Redactor:
    """Black out all text in a PDF that matches a regular expression.

    The PDF is opened with PyMuPDF (``fitz``); every match of ``reg`` found
    in a page's text is located on that page and covered by a black
    redaction box, and the result is written to a new file.
    """

    # static methods work independent of class object
    @staticmethod
    def get_sensitive_data(lines, REG):
        """Yield every non-empty match of ``REG`` found in ``lines``.

        Args:
            lines: Iterable of text lines to scan.
            REG: Regular expression as a string; matching is
                case-insensitive. If the pattern has at least one capture
                group, the text of group 1 is yielded, otherwise the text
                of the whole match.

        Yields:
            Matched strings in order of appearance. All matches on a line
            are reported, not just the first one, so several sensitive
            items on the same line are all picked up.
        """
        pattern = re.compile(REG, re.IGNORECASE)
        # Fall back to the whole match so group-less patterns do not raise
        # IndexError on group(1).
        group = 1 if pattern.groups else 0
        for line in lines:
            for match in pattern.finditer(line):
                text = match.group(group)
                # Empty or non-participating groups carry nothing to
                # search for on the page.
                if text:
                    yield text

    # constructor
    def __init__(self, path, reg):
        """Store the redaction job parameters.

        Args:
            path: Path of the PDF file to redact.
            reg: Regular expression describing the sensitive text.
        """
        self.path = path
        self.reg = reg

    def redaction(self, output_path="redacted_pdf.pdf"):
        """Redact every match of ``self.reg`` and save the result.

        Args:
            output_path: Where the redacted copy is written. Defaults to
                ``"redacted_pdf.pdf"`` in the current working directory.
        """
        # opening the pdf
        doc = fitz.open(self.path)
        try:
            # iterating through pages
            for page in doc:
                # NOTE(review): _wrapContents, getText, searchFor and
                # addRedactAnnot look like legacy camelCase PyMuPDF names;
                # newer releases appear to spell them in snake_case --
                # confirm against the installed version.
                # _wrapContents is needed for fixing alignment issues
                # with rect boxes in some cases.
                page._wrapContents()
                lines = page.getText("text").split("\n")
                # searchFor already returns every occurrence on the page,
                # so each distinct string only needs to be looked up once.
                found = dict.fromkeys(
                    self.get_sensitive_data(lines, self.reg)
                )
                for data in found:
                    # drawing a filled black box over each occurrence
                    for area in page.searchFor(data):
                        page.addRedactAnnot(area, fill=(0, 0, 0))
                # applying the redaction
                page.apply_redactions()
            # saving it to a new pdf
            doc.save(output_path)
        finally:
            # Release the file handle even if a page fails to process.
            doc.close()
        print("Successfully redacted")
# Manual smoke test; runs only when the file is executed as a script.
if __name__ == "__main__":
    # E-mail pattern with a single capture group around the address.
    # NOTE: the domain part is "label.label" only, so an address such as
    # a@b.co.uk is captured as a@b.co.
    EMAIL_REG = r"([\w\.\d]+\@[\w\d]+\.[\w\d]+)"
    # Point this at the PDF that should be redacted.
    path = "sample_pdf.pdf"
    Redactor(path, EMAIL_REG).redaction()