Unstructured-IO · beez2022 · Jun 5, 2024 · Jun 5, 2024 · Jun 6, 2024 · Jul 3, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ### Enhancements
 
+* **Add a kwarg to `partition_pdf()` to skip OCR for certain element types**. The `pdf_skip_ocr_element_types` kwarg is a list of element types that will not be OCR-ed; it only takes effect in the individual-blocks OCR mode.
+
 ### Features
 
 ### Fixes

diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -519,6 +519,11 @@ def _partition_pdf_or_image_local(
         process_file_with_pdfminer,
     )
 
+    if 'pdf_skip_ocr_element_types' in kwargs.keys():
+        pdf_skip_ocr_element_types = kwargs['pdf_skip_ocr_element_types']
+    else:
+        pdf_skip_ocr_element_types = []
+
     if languages is None:
         languages = ["eng"]
 
@@ -585,6 +590,7 @@ def _partition_pdf_or_image_local(
                 infer_table_structure=infer_table_structure,
                 ocr_languages=ocr_languages,
                 ocr_mode=ocr_mode,
+                pdf_skip_ocr_element_types=pdf_skip_ocr_element_types,
                 pdf_image_dpi=pdf_image_dpi,
             )
     else:
@@ -626,6 +632,7 @@ def _partition_pdf_or_image_local(
                 infer_table_structure=infer_table_structure,
                 ocr_languages=ocr_languages,
                 ocr_mode=ocr_mode,
+                pdf_skip_ocr_element_types=pdf_skip_ocr_element_types,
                 pdf_image_dpi=pdf_image_dpi,
             )
 

diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
@@ -34,6 +34,8 @@ def process_data_with_ocr(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     ocr_mode: str = OCRMode.FULL_PAGE.value,
+    # (BK) pdf_skip_ocr_element_types lists element types that will not be OCR-ed (only applies when ocr_mode is OCRMode.INDIVIDUAL_BLOCKS.value)
+    pdf_skip_ocr_element_types: list = [],
     pdf_image_dpi: int = 200,
 ) -> "DocumentLayout":
     """
@@ -75,6 +77,7 @@ def process_data_with_ocr(
             infer_table_structure=infer_table_structure,
             ocr_languages=ocr_languages,
             ocr_mode=ocr_mode,
+            pdf_skip_ocr_element_types=pdf_skip_ocr_element_types,
             pdf_image_dpi=pdf_image_dpi,
         )
         return merged_layouts
@@ -89,6 +92,8 @@ def process_file_with_ocr(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     ocr_mode: str = OCRMode.FULL_PAGE.value,
+    # (BK) pdf_skip_ocr_element_types lists element types that will not be OCR-ed (only applies when ocr_mode is OCRMode.INDIVIDUAL_BLOCKS.value)
+    pdf_skip_ocr_element_types: list = [],
     pdf_image_dpi: int = 200,
 ) -> "DocumentLayout":
     """
@@ -135,6 +140,7 @@ def process_file_with_ocr(
                         infer_table_structure=infer_table_structure,
                         ocr_languages=ocr_languages,
                         ocr_mode=ocr_mode,
+                        pdf_skip_ocr_element_types=pdf_skip_ocr_element_types,
                         extracted_regions=extracted_regions,
                     )
                     merged_page_layouts.append(merged_page_layout)
@@ -157,6 +163,7 @@ def process_file_with_ocr(
                             infer_table_structure=infer_table_structure,
                             ocr_languages=ocr_languages,
                             ocr_mode=ocr_mode,
+                            pdf_skip_ocr_element_types=pdf_skip_ocr_element_types,
                             extracted_regions=extracted_regions,
                         )
                         merged_page_layouts.append(merged_page_layout)
@@ -175,6 +182,7 @@ def supplement_page_layout_with_ocr(
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     ocr_mode: str = OCRMode.FULL_PAGE.value,
+    pdf_skip_ocr_element_types: list=[],
     extracted_regions: Optional[List["TextRegion"]] = None,
 ) -> "PageLayout":
     """
@@ -198,23 +206,28 @@ def supplement_page_layout_with_ocr(
     elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value:
         for element in page_layout.elements:
             if not element.text:
-                padding = env_config.IMAGE_CROP_PAD
-                padded_element = pad_element_bboxes(element, padding=padding)
-                cropped_image = image.crop(
-                    (
-                        padded_element.bbox.x1,
-                        padded_element.bbox.y1,
-                        padded_element.bbox.x2,
-                        padded_element.bbox.y2,
-                    ),
-                )
+                # (BK) Skip OCR for element types in pdf_skip_ocr_element_types; their text is set to "-".
+                # For images, use the element type name produced by the model (e.g. Picture for Yolox).
+                if element.type in pdf_skip_ocr_element_types:
+                    element.text = "-"
+                else:
+                    padding = env_config.IMAGE_CROP_PAD
+                    padded_element = pad_element_bboxes(element, padding=padding)
+                    cropped_image = image.crop(
+                        (
+                            padded_element.bbox.x1,
+                            padded_element.bbox.y1,
+                            padded_element.bbox.x2,
+                            padded_element.bbox.y2,
+                        ),
+                    )
                 # Note(yuming): instead of getting OCR layout, we just need
                 # the text extraced from OCR for individual elements
-                text_from_ocr = ocr_agent.get_text_from_image(
-                    cropped_image,
-                    ocr_languages=ocr_languages,
-                )
-                element.text = text_from_ocr
+                    text_from_ocr = ocr_agent.get_text_from_image(
+                        cropped_image,
+                        ocr_languages=ocr_languages,
+                    )
+                    element.text = text_from_ocr
     else:
         raise ValueError(
             "Invalid OCR mode. Parameter `ocr_mode` "