Merge pull request #4 from mahdiramezanii/develop

finish
mahdiramezanii · Jun 29, 2024 · a6860a6 · a6860a6
2 parents 547af0f + 02c3b25
commit a6860a6
Show file tree

Hide file tree

Showing 708 changed files with 118,809 additions and 66 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1 +1,2 @@
-include static/*
+recursive-include src/my_pkg/resources *
+recursive-include persian_pdf_converter/*
diff --git a/README.md b/README.md
@@ -22,19 +22,6 @@ To install the package, use pip:
 pip install persian-pdf-converter
 ```
 
-### Install Tesseract
-
-For the `pdf_to_word` function to work correctly, you need to have Tesseract OCR installed. You can download and install Tesseract from [here](https://github.com/tesseract-ocr/tesseract). After installation, make sure Tesseract's binary path is added to your system's PATH.
-
-For Windows:
-```bash
-setx PATH "%PATH%;C:\Program Files\Tesseract-OCR"
-```
-
-For Unix-based systems (Linux, macOS):
-```bash
-export PATH=$PATH:/usr/local/bin
-```
 
 ## Usage
 

diff --git a/build/lib/persian_pdf_converter/imports.py b/build/lib/persian_pdf_converter/imports.py
@@ -0,0 +1,66 @@
+
+#================================  import tqdm  =====================================
+'''Import the necessary libraries.
+'tqdm' is used for creating progress bars,
+which are useful for tracking the progress of
+loops.'''
+from tqdm import tqdm
+#================================================================================
+
+#=================================  import pdf2image ===========================
+''''convert_from_path' from 'pdf2image'
+ library is used to convert PDF
+pages into images.'''
+from pdf2image import convert_from_path
+#=================================  End import pdf2image ===========================
+
+
+#=================================   import pytesseract =============================
+''''pytesseract' is a wrapper for Google's
+ Tesseract-OCR Engine,
+used for performing optical character
+ recognition (OCR) on images.'''
+import pytesseract
+#================================= End  import pytesseract =============================
+
+#================================= import PIL ===============================
+''''Image' from 'PIL' (Python Imaging Library) 
+is used for opening,
+manipulating, and saving many different
+ image file formats.'''
+from PIL import Image
+#================================= End  import PIL ===============================
+
+
+#===============================  import Document ==========================
+# 'Document' from 'docx' library is used to create new Word documents.
+from docx import Document
+#=============================== End import Document ==========================
+
+
+#================================  import WD_ALIGN_PARAGRAPH  =====================
+''''WD_ALIGN_PARAGRAPH' from 'docx.enum.text'
+is used to align text in the Word document.'''
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+#================================ End import WD_ALIGN_PARAGRAPH  =====================
+
+
+#================================ import tempfile ==================================
+# 'tempfile' is used to create temporary files and directories.
+import tempfile
+#================================ End import tempfile ==================================
+
+#================================ import persian_pdf_converter  ==========================
+''' '_my_random_string' is a custom function
+ from 'persian_pdf_converter'
+used to generate random strings
+ for naming the output files.'''
+
+from persian_pdf_converter import _my_random_string
+#================================ End import persian_pdf_converter =================================
+
+#================================= import Path   ========================
+'''The 'Path' class from 'pathlib' provides object-oriented
+ filesystem paths.'''
+from pathlib import Path
+#================================= End import Path  ========================
diff --git a/build/lib/persian_pdf_converter/pdf_converter.py b/build/lib/persian_pdf_converter/pdf_converter.py
@@ -1,63 +1,156 @@
-import os
-
-from tqdm import tqdm
-from pdf2image import convert_from_path
-import pytesseract
-from PIL import Image
-from docx import Document
-from docx.enum.text import WD_ALIGN_PARAGRAPH
-from pathlib import Path
-import tempfile
+#========================== imports lib  ==================================
+
+from imports import *
 from persian_pdf_converter import _my_random_string
 
-from pathlib import Path
+#============================ End imports =============================
+
+
+''' Get the directory two levels above
+the current script file (the parent of the directory containing it).'''
+parent_path = Path(__file__).resolve().parent.parent
+
 
-parent_path= Path(__file__).resolve().parent.parent
+'''
 
+Define a function to convert a PDF
+file to a Word document.
 
+'''
 def pdf_to_word(pdf_path: str, output_dir: str, lang="fas+eng", **kwargs):
+
+    #==================================
+    '''Replace backslashes with forward
+     slashes in the output directory path
+     to ensure compatibility across different
+     operating systems.'''
     output_dir = output_dir.replace("\\", "/")
+    # ==================================
+
+    #====================================================================
+    '''Generate a random name for the output
+     Word document using the custom
+    '_my_random_string' function.'''
     pdf_name = f"word-{_my_random_string(6)}"
+    # ====================================================================
+
 
 
-    pages = convert_from_path(pdf_path,poppler_path=f"{parent_path}/static/poppler-24.02/bin")
+    #=====================================================================
+    '''Convert PDF pages into images.
+    'poppler_path' specifies the path to 
+    the Poppler utility,
+    required for PDF to image conversion.'''
+    pages = convert_from_path(pdf_path,
+                              poppler_path=f"{parent_path}/src/my_pkg/resources/poppler-24.02/bin")
+    # =====================================================================
 
-    pytesseract.pytesseract.tesseract_cmd = f"{parent_path}/static/Tesseract-OCR/tesseract.exe"
+
+    #==========================================================
+    # Set the path to the Tesseract OCR executable.
+    pytesseract.pytesseract.tesseract_cmd = \
+        f"{parent_path}/src/my_pkg/resources/Tesseract-OCR/tesseract.exe"
+    #=====================================================================
+
+    #========================================================================
+    # Initialize an empty list to store the
+    # extracted text from each page.
     texts = []
+    #=========================================================================
 
+    #================================================================
+    # Loop through each page image and perform OCR to extract text.
     for i, page in tqdm(enumerate(pages), position=0):
+    #===============================================================
+
+        #==============================================================
+        # Create a temporary directory to store the page image.
         with tempfile.TemporaryDirectory() as img_dir:
+            # Define the file name for the image.
             img_name = f'{pdf_name}-{i+1}.jpg'
+            # Define the full path for the image file.
             img_path = Path(img_dir) / img_name
+        #==============================================================
 
-
+            #===================================================================
+            # Save the page as a JPEG image.
             page.save(img_path, 'JPEG')
+
+
+            '''
+             Perform OCR on the saved image
+             to extract text.
+            'lang' specifies the languages
+             to use for OCR 
+            (e.g., Persian and English).
+            '''
+
             text = pytesseract.image_to_string(Image.open(img_path), lang=lang)
+            # Append the extracted text to the list.
             texts.append(text)
+            #===================================================================
+
 
+    # Create a new Word document.
     document = Document()
+
+    #==================================================================
+    '''
+    Set the default font for normal
+    text in the document.
+    '''
     style_normal = document.styles['Normal']
     font = style_normal.font
     font.name = 'Arial'
+    #==================================================================
+
+    #===================================================================
+    '''Enable right-to-left text direction,
+    which is important for Persian text.'''
     font.rtl = True
+    #=====================================================================
+
+
 
+    #=====================================================================
+    '''Set the font for heading
+    1 style text in the document.'''
     style_h1 = document.styles['Heading 1']
     font = style_h1.font
     font.name = 'Arial'
+    #=======================================================================
+
+
+
+    # Enable right-to-left text direction for headings as well.
     font.rtl = True
 
+    #=====================================================================
+    '''Loop through each extracted text
+    block and add it to the document.'''
     for i, text in tqdm(enumerate(texts), position=0):
+        # Add a heading to indicate the page number.
         heading = document.add_heading(f'صفحه: {i+1}', level=1)
+        # Align the heading to the right.
         heading.alignment = WD_ALIGN_PARAGRAPH.RIGHT
+        # Apply the heading style to the heading.
         heading.style = document.styles['Heading 1']
 
+        # Add the extracted text as a paragraph in the document.
         paragraph = document.add_paragraph(text)
+        # Align the paragraph to the right.
         paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
+        # Apply the normal style to the paragraph.
         paragraph.style = document.styles['Normal']
+    #=========================================================================
 
+
+    #=========================================================================
+    # Define the output path for the Word document.
     output_path = Path(output_dir) / f'{pdf_name}.docx'
+    # Save the Word document to the specified output path.
     document.save(output_path)
+    #=========================================================================
 
-    return f'{pdf_name}.docx'
-
-
+    # Return the filename of the generated Word document.
+    return f'{pdf_name}.docx'
diff --git a/build/lib/src/__init__.py b/build/lib/src/__init__.py
diff --git a/build/lib/src/my_pkg/__init__.py b/build/lib/src/my_pkg/__init__.py