From a38a0aa594bd9fda27149d4145f229cf56723eb8 Mon Sep 17 00:00:00 2001
From: beez2022 <101962834+beez2022@users.noreply.github.com>
Date: Wed, 5 Jun 2024 14:24:32 +0800
Subject: [PATCH 1/4] include no_ocr_elements

---
 unstructured/partition/pdf.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index 724145b0b5..4e5bc85367 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -519,6 +519,11 @@ def _partition_pdf_or_image_local(
         process_file_with_pdfminer,
     )
 
+    if 'no_ocr_elements' in kwargs.keys():
+        no_ocr_elements = kwargs['no_ocr_elements']
+    else:
+        no_ocr_elements = []
+
     if languages is None:
         languages = ["eng"]
 
@@ -585,6 +590,7 @@ def _partition_pdf_or_image_local(
                 infer_table_structure=infer_table_structure,
                 ocr_languages=ocr_languages,
                 ocr_mode=ocr_mode,
+                no_ocr_elements=no_ocr_elements,
                 pdf_image_dpi=pdf_image_dpi,
             )
     else:
@@ -626,6 +632,7 @@ def _partition_pdf_or_image_local(
                 infer_table_structure=infer_table_structure,
                 ocr_languages=ocr_languages,
                 ocr_mode=ocr_mode,
+                no_ocr_elements=no_ocr_elements,
                 pdf_image_dpi=pdf_image_dpi,
             )
 

From 649d1108eab54235d00a050476aa201314061e6f Mon Sep 17 00:00:00 2001
From: beez2022 <101962834+beez2022@users.noreply.github.com>
Date: Wed, 5 Jun 2024 14:34:33 +0800
Subject: [PATCH 2/4] added no_ocr_elements ocr.py

---
 unstructured/partition/pdf_image/ocr.py | 42 ++++++++++++++++---------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
index 39ca6f995e..28e998e063 100644
--- a/unstructured/partition/pdf_image/ocr.py
+++ b/unstructured/partition/pdf_image/ocr.py
@@ -34,6 +34,8 @@ def process_data_with_ocr(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     ocr_mode: str = OCRMode.FULL_PAGE.value,
+# (BK) no_ocr_elements store elements that will not be ocr-ed (only works with ocr_mode="individual blocks")
+    no_ocr_elements: list = [],
     pdf_image_dpi: int = 200,
 ) -> "DocumentLayout":
     """
@@ -75,6 +77,7 @@ def process_data_with_ocr(
             infer_table_structure=infer_table_structure,
             ocr_languages=ocr_languages,
             ocr_mode=ocr_mode,
+            no_ocr_elements=no_ocr_elements
             pdf_image_dpi=pdf_image_dpi,
         )
         return merged_layouts
@@ -89,6 +92,8 @@ def process_file_with_ocr(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     ocr_mode: str = OCRMode.FULL_PAGE.value,
+# (BK) no_ocr_elements store elements that will not be ocr-ed (only works with ocr_mode="individual blocks")
+    no_ocr_elements: list = [],
     pdf_image_dpi: int = 200,
 ) -> "DocumentLayout":
     """
@@ -135,6 +140,7 @@ def process_file_with_ocr(
                         infer_table_structure=infer_table_structure,
                         ocr_languages=ocr_languages,
                         ocr_mode=ocr_mode,
+                        no_ocr_elements=no_ocr_elements,
                         extracted_regions=extracted_regions,
                     )
                     merged_page_layouts.append(merged_page_layout)
@@ -157,6 +163,7 @@ def process_file_with_ocr(
                             infer_table_structure=infer_table_structure,
                             ocr_languages=ocr_languages,
                             ocr_mode=ocr_mode,
+                            no_ocr_elements=no_ocr_elements,
                             extracted_regions=extracted_regions,
                         )
                         merged_page_layouts.append(merged_page_layout)
@@ -175,6 +182,7 @@ def supplement_page_layout_with_ocr(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     ocr_mode: str = OCRMode.FULL_PAGE.value,
+    no_ocr_elements: list=[],
     extracted_regions: Optional[List["TextRegion"]] = None,
 ) -> "PageLayout":
     """
@@ -198,23 +206,27 @@ def supplement_page_layout_with_ocr(
     elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:
         for element in page_layout.elements:
             if not element.text:
-                padding = env_config.IMAGE_CROP_PAD
-                padded_element = pad_element_bboxes(element, padding=padding)
-                cropped_image = image.crop(
-                    (
-                        padded_element.bbox.x1,
-                        padded_element.bbox.y1,
-                        padded_element.bbox.x2,
-                        padded_element.bbox.y2,
-                    ),
-                )
+# any element in no_ocr_elements will not be ocr-ed, it will return a value of -
+                if element.type in no_ocr_elements:
+                    element.text = "-"
+                else:
+                    padding = env_config.IMAGE_CROP_PAD
+                    padded_element = pad_element_bboxes(element, padding=padding)
+                    cropped_image = image.crop(
+                        (
+                            padded_element.bbox.x1,
+                            padded_element.bbox.y1,
+                            padded_element.bbox.x2,
+                            padded_element.bbox.y2,
+                        ),
+                    )
                 # Note(yuming): instead of getting OCR layout, we just need
                 # the text extraced from OCR for individual elements
-                text_from_ocr = ocr_agent.get_text_from_image(
-                    cropped_image,
-                    ocr_languages=ocr_languages,
-                )
-                element.text = text_from_ocr
+                    text_from_ocr = ocr_agent.get_text_from_image(
+                        cropped_image,
+                        ocr_languages=ocr_languages,
+                    )
+                    element.text = text_from_ocr
     else:
         raise ValueError(
             "Invalid OCR mode. Parameter `ocr_mode` "

From 33474d1ee2a028a4987e003f6858f3c5a5fd1493 Mon Sep 17 00:00:00 2001
From: ext-seahbeekheng <seahbeekheng1@spgroup.com.sg>
Date: Thu, 6 Jun 2024 14:18:00 +0800
Subject: [PATCH 3/4] added key argument to indicate list of element types that
 do not need ocr

---
 unstructured/partition/pdf.py           | 10 +++++-----
 unstructured/partition/pdf_image/ocr.py | 21 +++++++++++----------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index 4e5bc85367..f42cd18fe2 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -519,10 +519,10 @@ def _partition_pdf_or_image_local(
         process_file_with_pdfminer,
     )
 
-    if 'no_ocr_elements' in kwargs.keys():
-        no_ocr_elements = kwargs['no_ocr_elements']
+    if 'pdf_skip_ocr_element_types' in kwargs.keys():
+        pdf_skip_ocr_element_types = kwargs['pdf_skip_ocr_element_types']
     else:
-        no_ocr_elements = []
+        pdf_skip_ocr_element_types = []
 
     if languages is None:
         languages = ["eng"]
@@ -590,7 +590,7 @@ def _partition_pdf_or_image_local(
                 infer_table_structure=infer_table_structure,
                 ocr_languages=ocr_languages,
                 ocr_mode=ocr_mode,
-                no_ocr_elements=no_ocr_elements,
+                pdf_skip_ocr_element_types=pdf_skip_ocr_element_types,
                 pdf_image_dpi=pdf_image_dpi,
             )
     else:
@@ -632,7 +632,7 @@ def _partition_pdf_or_image_local(
                 infer_table_structure=infer_table_structure,
                 ocr_languages=ocr_languages,
                 ocr_mode=ocr_mode,
-                no_ocr_elements=no_ocr_elements,
+                pdf_skip_ocr_element_types=pdf_skip_ocr_element_types,
                 pdf_image_dpi=pdf_image_dpi,
             )
 
diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
index 28e998e063..fc71252e14 100644
--- a/unstructured/partition/pdf_image/ocr.py
+++ b/unstructured/partition/pdf_image/ocr.py
@@ -34,8 +34,8 @@ def process_data_with_ocr(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     ocr_mode: str = OCRMode.FULL_PAGE.value,
-# (BK) no_ocr_elements store elements that will not be ocr-ed (only works with ocr_mode="individual blocks")
-    no_ocr_elements: list = [],
+# (BK) pdf_skip_ocr_element_types stores element types that will not be ocr-ed (only works with ocr_mode="individual blocks")
+    pdf_skip_ocr_element_types: list = [],
     pdf_image_dpi: int = 200,
 ) -> "DocumentLayout":
     """
@@ -77,7 +77,7 @@ def process_data_with_ocr(
             infer_table_structure=infer_table_structure,
             ocr_languages=ocr_languages,
             ocr_mode=ocr_mode,
-            no_ocr_elements=no_ocr_elements
+            pdf_skip_ocr_element_types=pdf_skip_ocr_element_types,
             pdf_image_dpi=pdf_image_dpi,
         )
         return merged_layouts
@@ -92,8 +92,8 @@ def process_file_with_ocr(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     ocr_mode: str = OCRMode.FULL_PAGE.value,
-# (BK) no_ocr_elements store elements that will not be ocr-ed (only works with ocr_mode="individual blocks")
-    no_ocr_elements: list = [],
+# (BK) pdf_skip_ocr_element_types stores element types that will not be ocr-ed (only works with ocr_mode="individual blocks")
+    pdf_skip_ocr_element_types: list = [],
     pdf_image_dpi: int = 200,
 ) -> "DocumentLayout":
     """
@@ -140,7 +140,7 @@ def process_file_with_ocr(
                         infer_table_structure=infer_table_structure,
                         ocr_languages=ocr_languages,
                         ocr_mode=ocr_mode,
-                        no_ocr_elements=no_ocr_elements,
+                        pdf_skip_ocr_element_types=pdf_skip_ocr_element_types,
                         extracted_regions=extracted_regions,
                     )
                     merged_page_layouts.append(merged_page_layout)
@@ -163,7 +163,7 @@ def process_file_with_ocr(
                             infer_table_structure=infer_table_structure,
                             ocr_languages=ocr_languages,
                             ocr_mode=ocr_mode,
-                            no_ocr_elements=no_ocr_elements,
+                            pdf_skip_ocr_element_types=pdf_skip_ocr_element_types,
                             extracted_regions=extracted_regions,
                         )
                         merged_page_layouts.append(merged_page_layout)
@@ -182,7 +182,7 @@ def supplement_page_layout_with_ocr(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     ocr_mode: str = OCRMode.FULL_PAGE.value,
-    no_ocr_elements: list=[],
+    pdf_skip_ocr_element_types: list=[],
     extracted_regions: Optional[List["TextRegion"]] = None,
 ) -> "PageLayout":
     """
@@ -206,8 +206,9 @@ def supplement_page_layout_with_ocr(
     elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:
         for element in page_layout.elements:
             if not element.text:
-# any element in no_ocr_elements will not be ocr-ed, it will return a value of -
-                if element.type in no_ocr_elements:
+# (BK) elements whose type is listed in pdf_skip_ocr_element_types are not ocr-ed; their text is set to "-" instead.
+# For images, use the element type name produced by the layout model (e.g. Picture for Yolox).
+                if element.type in pdf_skip_ocr_element_types:
                     element.text = "-"
                 else:
                     padding = env_config.IMAGE_CROP_PAD

From c8e3f14878723dd554fca49cccc088784dc65798 Mon Sep 17 00:00:00 2001
From: ext-seahbeekheng <seahbeekheng1@spgroup.com.sg>
Date: Wed, 3 Jul 2024 14:00:35 +0800
Subject: [PATCH 4/4] updated changelog.md

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2e77a4db5c..5d659cc777 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ### Enhancements
 
+* **Added kwarg in partition_pdf() to skip OCR for certain element types**. The kwarg `pdf_skip_ocr_element_types` takes a list of element types that will not be OCR-ed.
+
 ### Features
 
 ### Fixes