diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e77a4db5c..5d659cc777 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ### Enhancements +* **Added kwarg in partition_pdf() to skip OCR for certain element types**. The kwarg `pdf_skip_ocr_element_types` takes a list of element types that will not be OCR-ed. + ### Features ### Fixes diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 724145b0b5..f42cd18fe2 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -519,6 +519,11 @@ def _partition_pdf_or_image_local( process_file_with_pdfminer, ) + if 'pdf_skip_ocr_element_types' in kwargs.keys(): + pdf_skip_ocr_element_types = kwargs['pdf_skip_ocr_element_types'] + else: + pdf_skip_ocr_element_types = [] + if languages is None: languages = ["eng"] @@ -585,6 +590,7 @@ def _partition_pdf_or_image_local( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, + pdf_skip_ocr_element_types=pdf_skip_ocr_element_types, pdf_image_dpi=pdf_image_dpi, ) else: @@ -626,6 +632,7 @@ def _partition_pdf_or_image_local( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, + pdf_skip_ocr_element_types=pdf_skip_ocr_element_types, pdf_image_dpi=pdf_image_dpi, ) diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py index 39ca6f995e..fc71252e14 100644 --- a/unstructured/partition/pdf_image/ocr.py +++ b/unstructured/partition/pdf_image/ocr.py @@ -34,6 +34,8 @@ def process_data_with_ocr( infer_table_structure: bool = False, ocr_languages: str = "eng", ocr_mode: str = OCRMode.FULL_PAGE.value, +# (BK) pdf_skip_ocr_element_types stores element types that will not be ocr-ed (only works with ocr_mode="individual blocks") + pdf_skip_ocr_element_types: list = [], pdf_image_dpi: int = 200, ) -> "DocumentLayout": """ @@ -75,6 +77,7 @@ def process_data_with_ocr( 
infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, + pdf_skip_ocr_element_types=pdf_skip_ocr_element_types, pdf_image_dpi=pdf_image_dpi, ) return merged_layouts @@ -89,6 +92,8 @@ def process_file_with_ocr( infer_table_structure: bool = False, ocr_languages: str = "eng", ocr_mode: str = OCRMode.FULL_PAGE.value, +# (BK) pdf_skip_ocr_element_types stores element types that will not be ocr-ed (only works with ocr_mode="individual blocks") + pdf_skip_ocr_element_types: list = [], pdf_image_dpi: int = 200, ) -> "DocumentLayout": """ @@ -135,6 +140,7 @@ def process_file_with_ocr( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, + pdf_skip_ocr_element_types=pdf_skip_ocr_element_types, extracted_regions=extracted_regions, ) merged_page_layouts.append(merged_page_layout) @@ -157,6 +163,7 @@ def process_file_with_ocr( infer_table_structure=infer_table_structure, ocr_languages=ocr_languages, ocr_mode=ocr_mode, + pdf_skip_ocr_element_types=pdf_skip_ocr_element_types, extracted_regions=extracted_regions, ) merged_page_layouts.append(merged_page_layout) @@ -175,6 +182,7 @@ def supplement_page_layout_with_ocr( infer_table_structure: bool = False, ocr_languages: str = "eng", ocr_mode: str = OCRMode.FULL_PAGE.value, + pdf_skip_ocr_element_types: list=[], extracted_regions: Optional[List["TextRegion"]] = None, ) -> "PageLayout": """ @@ -198,23 +206,28 @@ def supplement_page_layout_with_ocr( elif ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value: for element in page_layout.elements: if not element.text: - padding = env_config.IMAGE_CROP_PAD - padded_element = pad_element_bboxes(element, padding=padding) - cropped_image = image.crop( - ( - padded_element.bbox.x1, - padded_element.bbox.y1, - padded_element.bbox.x2, - padded_element.bbox.y2, - ), - ) +# (BK) any element whose type is in pdf_skip_ocr_element_types will not be ocr-ed; its text is set to "-" instead +# for images, use the element type name obtained from the 
model (e.g. Picture for Yolox) + if element.type in pdf_skip_ocr_element_types: + element.text = "-" + else: + padding = env_config.IMAGE_CROP_PAD + padded_element = pad_element_bboxes(element, padding=padding) + cropped_image = image.crop( + ( + padded_element.bbox.x1, + padded_element.bbox.y1, + padded_element.bbox.x2, + padded_element.bbox.y2, + ), + ) # Note(yuming): instead of getting OCR layout, we just need # the text extraced from OCR for individual elements - text_from_ocr = ocr_agent.get_text_from_image( - cropped_image, - ocr_languages=ocr_languages, - ) - element.text = text_from_ocr + text_from_ocr = ocr_agent.get_text_from_image( + cropped_image, + ocr_languages=ocr_languages, + ) + element.text = text_from_ocr else: raise ValueError( "Invalid OCR mode. Parameter `ocr_mode` "