From a38a0aa594bd9fda27149d4145f229cf56723eb8 Mon Sep 17 00:00:00 2001 From: beez2022 <101962834+beez2022@users.noreply.github.com> Date: Wed, 5 Jun 2024 14:24:32 +0800 Subject: [PATCH 1/4] include no_ocr_elements --- unstructured/partition/pdf.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 724145b0b5..4e5bc85367 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -519,6 +519,11 @@ def _partition_pdf_or_image_local( process_file_with_pdfminer, ) + if 'no_ocr_elements' in kwargs.keys(): + no_ocr_elements = kwargs['no_ocr_elements'] + else: + no_ocr_elements = [] + if languages is None: languages = ["eng"] @@ -585,6 +590,7 @@ def _partition_pdf_or_image_local( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, + no_ocr_elements=no_ocr_elements, pdf_image_dpi=pdf_image_dpi, ) else: @@ -626,6 +632,7 @@ def _partition_pdf_or_image_local( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, + no_ocr_elements=no_ocr_elements, pdf_image_dpi=pdf_image_dpi, ) From 649d1108eab54235d00a050476aa201314061e6f Mon Sep 17 00:00:00 2001 From: beez2022 <101962834+beez2022@users.noreply.github.com> Date: Wed, 5 Jun 2024 14:34:33 +0800 Subject: [PATCH 2/4] added no_ocr_elements ocr.py --- unstructured/partition/pdf_image/ocr.py | 42 ++++++++++++++++--------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 39ca6f995e..28e998e063 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -34,6 +34,8 @@ def process_data_with_ocr( infer_table_structure: bool = False, ocr_languages: str = "eng", ocr_mode: str = OCRMode.FULL_PAGE.value, +# (BK) no_ocr_elements store elements that will not be ocr-ed (only works with ocr_mode="individual blocks") + 
no_ocr_elements: list = [], pdf_image_dpi: int = 200, ) -> "DocumentLayout": """ @@ -75,6 +77,7 @@ def process_data_with_ocr( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, + no_ocr_elements=no_ocr_elements pdf_image_dpi=pdf_image_dpi, ) return merged_layouts @@ -89,6 +92,8 @@ def process_file_with_ocr( infer_table_structure: bool = False, ocr_languages: str = "eng", ocr_mode: str = OCRMode.FULL_PAGE.value, +# (BK) no_ocr_elements store elements that will not be ocr-ed (only works with ocr_mode="individual blocks") + no_ocr_elements: list = [], pdf_image_dpi: int = 200, ) -> "DocumentLayout": """ @@ -135,6 +140,7 @@ def process_file_with_ocr( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, + no_ocr_elements=no_ocr_elements, extracted_regions=extracted_regions, ) merged_page_layouts.append(merged_page_layout) @@ -157,6 +163,7 @@ def process_file_with_ocr( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, + no_ocr_elements=no_ocr_elements, extracted_regions=extracted_regions, ) merged_page_layouts.append(merged_page_layout) @@ -175,6 +182,7 @@ def supplement_page_layout_with_ocr( infer_table_structure: bool = False, ocr_languages: str = "eng", ocr_mode: str = OCRMode.FULL_PAGE.value, + no_ocr_elements: list=[], extracted_regions: Optional[List["TextRegion"]] = None, ) -> "PageLayout": """ @@ -198,23 +206,27 @@ def supplement_page_layout_with_ocr( elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value: for element in page_layout.elements: if not element.text: - padding = env_config.IMAGE_CROP_PAD - padded_element = pad_element_bboxes(element, padding=padding) - cropped_image = image.crop( - ( - padded_element.bbox.x1, - padded_element.bbox.y1, - padded_element.bbox.x2, - padded_element.bbox.y2, - ), - ) +# any element in no_ocr_elements will not be ocr-ed, it will return a value of - + if element.type in no_ocr_elements: + element.text = "-" 
+ else: + padding = env_config.IMAGE_CROP_PAD + padded_element = pad_element_bboxes(element, padding=padding) + cropped_image = image.crop( + ( + padded_element.bbox.x1, + padded_element.bbox.y1, + padded_element.bbox.x2, + padded_element.bbox.y2, + ), + ) # Note(yuming): instead of getting OCR layout, we just need # the text extraced from OCR for individual elements - text_from_ocr = ocr_agent.get_text_from_image( - cropped_image, - ocr_languages=ocr_languages, - ) - element.text = text_from_ocr + text_from_ocr = ocr_agent.get_text_from_image( + cropped_image, + ocr_languages=ocr_languages, + ) + element.text = text_from_ocr else: raise ValueError( "Invalid OCR mode. Parameter `ocr_mode` " From 33474d1ee2a028a4987e003f6858f3c5a5fd1493 Mon Sep 17 00:00:00 2001 From: ext-seahbeekheng Date: Thu, 6 Jun 2024 14:18:00 +0800 Subject: [PATCH 3/4] added key argument to indicate list of element types that do not need ocr --- unstructured/partition/pdf.py | 10 +++++----- unstructured/partition/pdf_image/ocr.py | 21 +++++++++++---------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 4e5bc85367..f42cd18fe2 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -519,10 +519,10 @@ def _partition_pdf_or_image_local( process_file_with_pdfminer, ) - if 'no_ocr_elements' in kwargs.keys(): - no_ocr_elements = kwargs['no_ocr_elements'] + if 'pdf_skip_ocr_element_types' in kwargs.keys(): + pdf_skip_ocr_element_types = kwargs['pdf_skip_ocr_element_types'] else: - no_ocr_elements = [] + pdf_skip_ocr_element_types = [] if languages is None: languages = ["eng"] @@ -590,7 +590,7 @@ def _partition_pdf_or_image_local( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, - no_ocr_elements=no_ocr_elements, + pdf_skip_ocr_element_types=pdf_skip_ocr_element_types, pdf_image_dpi=pdf_image_dpi, ) else: @@ -632,7 +632,7 @@ def 
_partition_pdf_or_image_local( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, - no_ocr_elements=no_ocr_elements, + pdf_skip_ocr_element_types=pdf_skip_ocr_element_types, pdf_image_dpi=pdf_image_dpi, ) diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 28e998e063..fc71252e14 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -34,8 +34,8 @@ def process_data_with_ocr( infer_table_structure: bool = False, ocr_languages: str = "eng", ocr_mode: str = OCRMode.FULL_PAGE.value, -# (BK) no_ocr_elements store elements that will not be ocr-ed (only works with ocr_mode="individual blocks") - no_ocr_elements: list = [], +# (BK) pdf_skip_ocr_element_types stores element types that will not be ocr-ed (only works with ocr_mode="individual blocks") + pdf_skip_ocr_element_types: list = [], pdf_image_dpi: int = 200, ) -> "DocumentLayout": """ @@ -77,7 +77,7 @@ def process_data_with_ocr( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, - no_ocr_elements=no_ocr_elements + pdf_skip_ocr_element_types=pdf_skip_ocr_element_types, pdf_image_dpi=pdf_image_dpi, ) return merged_layouts @@ -92,8 +92,8 @@ def process_file_with_ocr( infer_table_structure: bool = False, ocr_languages: str = "eng", ocr_mode: str = OCRMode.FULL_PAGE.value, -# (BK) no_ocr_elements store elements that will not be ocr-ed (only works with ocr_mode="individual blocks") - no_ocr_elements: list = [], +# (BK) pdf_skip_ocr_element_types stores element types that will not be ocr-ed (only works with ocr_mode="individual blocks") + pdf_skip_ocr_element_types: list = [], pdf_image_dpi: int = 200, ) -> "DocumentLayout": """ @@ -140,7 +140,7 @@ def process_file_with_ocr( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, - no_ocr_elements=no_ocr_elements, + 
pdf_skip_ocr_element_types=pdf_skip_ocr_element_types, extracted_regions=extracted_regions, ) merged_page_layouts.append(merged_page_layout) @@ -163,7 +163,7 @@ def process_file_with_ocr( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, - no_ocr_elements=no_ocr_elements, + pdf_skip_ocr_element_types=pdf_skip_ocr_element_types, extracted_regions=extracted_regions, ) merged_page_layouts.append(merged_page_layout) @@ -182,7 +182,7 @@ def supplement_page_layout_with_ocr( infer_table_structure: bool = False, ocr_languages: str = "eng", ocr_mode: str = OCRMode.FULL_PAGE.value, - no_ocr_elements: list=[], + pdf_skip_ocr_element_types: list=[], extracted_regions: Optional[List["TextRegion"]] = None, ) -> "PageLayout": """ @@ -206,8 +206,9 @@ def supplement_page_layout_with_ocr( elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value: for element in page_layout.elements: if not element.text: -# any element in no_ocr_elements will not be ocr-ed, it will return a value of - - if element.type in no_ocr_elements: +# (BK) any element type in pdf_skip_ocr_element_types will not be ocr-ed, it will return a value of - +# if it is image, use the element type obtained from the model (e.g. Picture for Yolox) + if element.type in pdf_skip_ocr_element_types: element.text = "-" else: padding = env_config.IMAGE_CROP_PAD From c8e3f14878723dd554fca49cccc088784dc65798 Mon Sep 17 00:00:00 2001 From: ext-seahbeekheng Date: Wed, 3 Jul 2024 14:00:35 +0800 Subject: [PATCH 4/4] updated changelog.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e77a4db5c..5d659cc777 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ### Enhancements +* **Added kwarg in partition_pdf() to skip OCR for certain element types**. The kwarg `pdf_skip_ocr_element_types` is a list of element types that will not be OCR-ed; it only applies in the individual-blocks OCR mode, and skipped elements are given the placeholder text "-". + ### Features ### Fixes