Skip to content

Commit

Permalink
Merge pull request #4 from mahdiramezanii/develop
Browse files Browse the repository at this point in the history
finish
  • Loading branch information
mahdiramezanii authored Jun 29, 2024
2 parents 547af0f + 02c3b25 commit a6860a6
Show file tree
Hide file tree
Showing 708 changed files with 118,809 additions and 66 deletions.
3 changes: 2 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
include static/*
recursive-include src/my_pkg/resources *
# recursive-include takes a directory followed by one or more file patterns;
# "persian_pdf_converter/*" as a single argument supplies no pattern.
recursive-include persian_pdf_converter *
13 changes: 0 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,6 @@ To install the package, use pip:
pip install persian-pdf-converter
```

### Install Tesseract

For the `pdf_to_word` function to work correctly, you need to have Tesseract OCR installed. You can download and install Tesseract from [here](https://github.com/tesseract-ocr/tesseract). After installation, make sure Tesseract's binary path is added to your system's PATH.

For Windows:
```bash
setx PATH "%PATH%;C:\Program Files\Tesseract-OCR"
```

For Unix-based systems (Linux, macOS):
```bash
export PATH=$PATH:/usr/local/bin
```

## Usage

Expand Down
66 changes: 66 additions & 0 deletions build/lib/persian_pdf_converter/imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@

#================================ import tqdm =====================================
'''Importing necessary libraries
'tqdm' is used for creating progress bars,
which are useful to track the progress of
loops.'''
from tqdm import tqdm
#================================================================================

#================================= import pdf2image ===========================
''''convert_from_path' from 'pdf2image'
library is used to convert PDF
pages into images.'''
from pdf2image import convert_from_path
#================================= End import pdf2image ===========================


#================================= import pytesseract =============================
''''pytesseract' is a wrapper for Google's
Tesseract-OCR Engine,
used for performing optical character
recognition (OCR) on images.'''
import pytesseract
#================================= End import pytesseract =============================

#================================= import PIL ===============================
''''Image' from 'PIL' (Python Imaging Library)
is used for opening,
manipulating, and saving many different
image file formats.'''
from PIL import Image
#================================= End import PIL ===============================


#=============================== import Document ==========================
# 'Document' from 'docx' library is used to create new Word documents.
from docx import Document
#=============================== End import Document ==========================


#================================ import WD_ALIGN_PARAGRAPH =====================
''''WD_ALIGN_PARAGRAPH' from 'docx.enum.text'
is used to align text in the Word document.'''
from docx.enum.text import WD_ALIGN_PARAGRAPH
#================================ End import WD_ALIGN_PARAGRAPH =====================


#================================ import tempfile ==================================
# 'tempfile' is used to create temporary files and directories.
import tempfile
#================================ End import tempfile ==================================

#================================ import persian_pdf_converter ==========================
''' '_my_random_string' is a custom function
from 'persian_pdf_converter'
used to generate random strings,
presumably for file naming.'''

from persian_pdf_converter import _my_random_string
#================================ End import persian_pdf_converter =================================

#================================= import Path ========================
''' 'Path' from 'pathlib' is used for object-oriented
filesystem paths. '''
from pathlib import Path
#================================= End import Path ========================
129 changes: 111 additions & 18 deletions build/lib/persian_pdf_converter/pdf_converter.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,156 @@
import os

from tqdm import tqdm
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from pathlib import Path
import tempfile
#========================== imports lib ==================================

from imports import *
from persian_pdf_converter import _my_random_string

from pathlib import Path
#============================ End imports =============================


# Directory two levels above this file, i.e. the parent of the directory
# that contains this module (note the double ``.parent``).
parent_path = Path(__file__).resolve().parent.parent
def _ocr_pages(pages, pdf_name, lang):
    """Run Tesseract OCR on every rendered page and collect the text.

    Each page is saved as a JPEG inside one shared temporary directory,
    read back for OCR, and the directory is removed once all pages are done.

    Args:
        pages: Page images produced by ``convert_from_path``.
        pdf_name: Base name used for the temporary image files.
        lang: Tesseract language string forwarded to ``image_to_string``.

    Returns:
        A list holding one extracted-text string per page, in page order.
    """
    texts = []
    with tempfile.TemporaryDirectory() as img_dir:
        # enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm can then
        # read the length of ``pages`` and display a total.
        for page_number, page in enumerate(tqdm(pages, position=0), start=1):
            img_path = Path(img_dir) / f'{pdf_name}-{page_number}.jpg'
            page.save(img_path, 'JPEG')
            # Close the image file before the temporary directory is removed;
            # a handle left open can block the cleanup (notably on Windows).
            with Image.open(img_path) as image:
                texts.append(pytesseract.image_to_string(image, lang=lang))
    return texts


def _build_document(texts):
    """Create a right-to-left Word document with one section per page text.

    Args:
        texts: Extracted page texts, in page order.

    Returns:
        The populated ``Document`` instance (not yet saved).
    """
    document = Document()

    # Body text and headings share the same font and right-to-left setting.
    for style_name in ('Normal', 'Heading 1'):
        font = document.styles[style_name].font
        font.name = 'Arial'
        font.rtl = True

    for page_number, text in enumerate(tqdm(texts, position=0), start=1):
        # Heading carrying the page number, right-aligned.
        heading = document.add_heading(f'صفحه: {page_number}', level=1)
        heading.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        heading.style = document.styles['Heading 1']

        # OCR text of the page as a right-aligned paragraph.
        paragraph = document.add_paragraph(text)
        paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        paragraph.style = document.styles['Normal']

    return document


def pdf_to_word(pdf_path: str, output_dir: str, lang="fas+eng", **kwargs):
    """Convert a PDF file to a Word document using OCR.

    The PDF is rendered to page images, each image is passed through
    Tesseract, and the recognised text is written to a ``.docx`` file with
    one heading plus one paragraph per page.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory in which the ``.docx`` file is written.
        lang: Tesseract language string (default: Persian plus English).
        **kwargs: Accepted for backward compatibility; currently unused.

    Returns:
        The file name (not the full path) of the generated document, of the
        form ``word-<random>.docx``.
    """
    # Normalise Windows-style separators before building the output path.
    output_dir = output_dir.replace("\\", "/")

    # Random base name shared by the temporary images and the output file.
    pdf_name = f"word-{_my_random_string(6)}"

    # Render the PDF once, using the poppler binaries under parent_path.
    pages = convert_from_path(
        pdf_path,
        poppler_path=f"{parent_path}/src/my_pkg/resources/poppler-24.02/bin",
    )

    # Point pytesseract at the Tesseract executable under parent_path.
    # NOTE(review): the path names tesseract.exe, so this looks
    # Windows-specific - confirm the supported platforms.
    pytesseract.pytesseract.tesseract_cmd = (
        f"{parent_path}/src/my_pkg/resources/Tesseract-OCR/tesseract.exe"
    )

    texts = _ocr_pages(pages, pdf_name, lang)
    document = _build_document(texts)

    output_path = Path(output_dir) / f'{pdf_name}.docx'
    document.save(output_path)

    return f'{pdf_name}.docx'
Empty file added build/lib/src/__init__.py
Empty file.
Empty file.
Loading

0 comments on commit a6860a6

Please sign in to comment.