-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from mahdiramezanii/develop
finish
- Loading branch information
Showing
708 changed files
with
118,809 additions
and
66 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
include static/* | ||
recursive-include src/my_pkg/resources * | ||
recursive-include persian_pdf_converter/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
|
||
#================================ import tqdm ===================================== | ||
'''Importing necessary libraries | ||
'tqdm' is used for creating progress bars, | ||
which are useful to track the progress of | ||
loops.''' | ||
from tqdm import tqdm | ||
#================================================================================ | ||
|
||
#================================= import pdf2image =========================== | ||
''''convert_from_path' from 'pdf2image' | ||
library is used to convert PDF | ||
pages into images.''' | ||
from pdf2image import convert_from_path | ||
#================================= End import pdf2image =========================== | ||
|
||
|
||
#================================= import pytesseract ============================= | ||
''''pytesseract' is a wrapper for Google's | ||
Tesseract-OCR Engine, | ||
used for performing optical character | ||
recognition (OCR) on images.''' | ||
import pytesseract | ||
#================================= End import pytesseract ============================= | ||
|
||
#================================= import PIL =============================== | ||
''''Image' from 'PIL' (Python Imaging Library) | ||
is used for opening, | ||
manipulating, and saving many different | ||
image file formats.''' | ||
from PIL import Image | ||
#================================= End import PIL =============================== | ||
|
||
|
||
#=============================== import Document ========================== | ||
# 'Document' from 'docx' library is used to create new Word documents. | ||
from docx import Document | ||
#=============================== End import Document ========================== | ||
|
||
|
||
#================================ import WD_ALIGN_PARAGRAPH ===================== | ||
''''WD_ALIGN_PARAGRAPH' from 'docx.enum.text' | ||
is used to align text in the Word document.''' | ||
from docx.enum.text import WD_ALIGN_PARAGRAPH | ||
#================================ End import WD_ALIGN_PARAGRAPH ===================== | ||
|
||
|
||
#================================ import tempfile ================================== | ||
# 'tempfile' is used to create temporary files and directories. | ||
import tempfile | ||
#================================ End import tempfile ================================== | ||
|
||
#================================ import persian_pdf_converter ========================== | ||
''' '_my_random_string' is a custom function | ||
from 'persian_pdf_converter' | ||
used to generate random strings, | ||
presumably for file naming.''' | ||
|
||
from persian_pdf_converter import _my_random_string | ||
#================================ End import persian_pdf_converter ================================= | ||
|
||
#================================= import Path ======================== | ||
''''Path' from 'pathlib' is used for object-oriented | ||
filesystem paths.''' | ||
from pathlib import Path | ||
#================================= End import Path ======================== |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,63 +1,156 @@ | ||
import os | ||
|
||
from tqdm import tqdm | ||
from pdf2image import convert_from_path | ||
import pytesseract | ||
from PIL import Image | ||
from docx import Document | ||
from docx.enum.text import WD_ALIGN_PARAGRAPH | ||
from pathlib import Path | ||
import tempfile | ||
#========================== imports lib ================================== | ||
|
||
from imports import * | ||
from persian_pdf_converter import _my_random_string | ||
|
||
from pathlib import Path | ||
#============================ End imports ============================= | ||
|
||
|
||
# Directory two levels above this file (the grandparent of the script, not
# its immediate parent). Paths to bundled resources are built relative to it.
parent_path = Path(__file__).resolve().parent.parent
''' | ||
Define a function to convert a PDF | ||
file to a Word document. | ||
''' | ||
def pdf_to_word(pdf_path: str, output_dir: str, lang="fas+eng", **kwargs) -> str:
    """Convert a PDF into a Word document by running OCR on every page.

    Each page is rendered to an image with ``convert_from_path``, saved as a
    JPEG in a temporary directory, and passed to
    ``pytesseract.image_to_string``. The recognised text of every page is
    written to a ``.docx`` file as a level-1 heading ("صفحه: N") followed by
    one paragraph, both right-aligned and flagged as right-to-left.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory the ``.docx`` file is saved into. Backslashes
            are replaced with forward slashes before use.
        lang: Language specification forwarded to Tesseract.
        **kwargs: Accepted for backward compatibility; not used.

    Returns:
        The file name (not the full path) of the generated document.
    """
    # Normalise Windows-style separators so the path is handled uniformly.
    output_dir = output_dir.replace("\\", "/")

    # Random base name shared by the output document and the page images.
    pdf_name = f"word-{_my_random_string(6)}"

    # Render the PDF once, using the Poppler binaries bundled with the package.
    pages = convert_from_path(
        pdf_path,
        poppler_path=f"{parent_path}/src/my_pkg/resources/poppler-24.02/bin",
    )

    # Point pytesseract at the bundled Tesseract executable.
    pytesseract.pytesseract.tesseract_cmd = (
        f"{parent_path}/src/my_pkg/resources/Tesseract-OCR/tesseract.exe"
    )

    texts = []
    # `total` is passed explicitly because wrapping enumerate() hides the
    # length from tqdm, which would otherwise show no percentage.
    for i, page in tqdm(enumerate(pages), total=len(pages), position=0):
        # A per-page scratch directory keeps at most one image on disk.
        with tempfile.TemporaryDirectory() as img_dir:
            img_path = Path(img_dir) / f'{pdf_name}-{i+1}.jpg'
            page.save(img_path, 'JPEG')

            # Close the image before the directory is removed; an open
            # handle can make the cleanup fail on Windows.
            with Image.open(img_path) as image:
                texts.append(pytesseract.image_to_string(image, lang=lang))

    document = Document()

    # Body text and headings share the same font and right-to-left flag.
    for style_name in ('Normal', 'Heading 1'):
        font = document.styles[style_name].font
        font.name = 'Arial'
        font.rtl = True

    for i, text in tqdm(enumerate(texts), total=len(texts), position=0):
        # Page-number heading, right-aligned.
        heading = document.add_heading(f'صفحه: {i+1}', level=1)
        heading.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        heading.style = document.styles['Heading 1']

        # Recognised text of the page, right-aligned.
        paragraph = document.add_paragraph(text)
        paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        paragraph.style = document.styles['Normal']

    output_path = Path(output_dir) / f'{pdf_name}.docx'
    document.save(output_path)

    # Only the file name is returned; the caller already knows output_dir.
    return f'{pdf_name}.docx'
Empty file.
Empty file.
Oops, something went wrong.