Remove unsupported chipper model (#3728)

The chipper model is no longer supported.
Unstructured-IO · Oct 17, 2024
commit b092d45 · 1 parent 1eceac2
Show file tree

Hide file tree

Showing 9 changed files with 88 additions and 202 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -153,41 +153,6 @@ jobs:
         make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
         make check-coverage
 
-  test_chipper:
-    strategy:
-      matrix:
-        python-version: ["3.10"]
-    runs-on: ubuntu-latest
-    env:
-      UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
-    needs: [setup, lint]
-    steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v5
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Setup virtual environment
-      uses: ./.github/actions/base-cache
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Test
-      env:
-        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
-        PYTHON: python${{ matrix.python-version }}
-        NLTK_DATA: ${{ github.workspace }}/nltk_data
-      run: |
-        source .venv/bin/activate
-        sudo apt-get update
-        sudo apt-get install -y poppler-utils
-        make install-pandoc install-test
-        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
-        sudo apt-get update
-        sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
-        tesseract --version
-        make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
-
   test_unit_no_extras:
     strategy:
       matrix:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,12 @@
-## 0.16.1-dev0
+## 0.16.1-dev1
 
 ### Enhancements
 
 ### Features
 
 ### Fixes
 
+* **Remove unsupported chipper model**
 * **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
 
 ## 0.16.0

diff --git a/Makefile b/Makefile
@@ -138,12 +138,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
 .PHONY: test
 test:
 	PYTHONPATH=. CI=$(CI) \
-	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
-
-.PHONY: test-chipper
-test-chipper:
-	PYTHONPATH=. CI=$(CI) \
-	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
+	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
 
 .PHONY: test-unstructured-api-unit
 test-unstructured-api-unit:
@@ -309,7 +304,7 @@ docker-test:
 	$(DOCKER_IMAGE) \
 	bash -c "CI=$(CI) \
 	UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
-	pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
+	pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
 
 .PHONY: docker-smoke-test
 docker-smoke-test:

diff --git a/setup.cfg b/setup.cfg
@@ -15,8 +15,6 @@ filterwarnings =
     ignore::DeprecationWarning
 python_classes = Test Describe
 python_functions = test_ it_ they_ but_ and_
-markers =
-    chipper: mark a test as running chipper, which tends to be slow and compute-heavy.
 testpaths =
     test_unstructured
     test_unstructured_ingest

diff --git a/test_unstructured/partition/pdf_image/test_chipper.py b/test_unstructured/partition/pdf_image/test_chipper.py
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -218,7 +218,7 @@ def test_partition_pdf_with_model_name_env_var(
         assert mock_process.call_args[1]["model_name"] == "checkbox"
 
 
-@pytest.mark.parametrize("model_name", ["checkbox", "yolox", "chipper"])
+@pytest.mark.parametrize("model_name", ["checkbox", "yolox"])
 def test_partition_pdf_with_model_name(
     monkeypatch,
     model_name,

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.1-dev0"  # pragma: no cover
+__version__ = "0.16.1-dev1"  # pragma: no cover
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -566,12 +566,7 @@ def _partition_pdf_or_image_local(
 
     hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
     if pdf_image_dpi is None:
-        pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
-    if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
-        logger.warning(
-            "The Chipper model performs better when images are rendered with DPI >= 300 "
-            f"(currently {pdf_image_dpi}).",
-        )
+        pdf_image_dpi = 200
 
     od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
     extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
@@ -588,53 +583,48 @@ def _partition_pdf_or_image_local(
             pdf_image_dpi=pdf_image_dpi,
         )
 
-        if hi_res_model_name.startswith("chipper"):
-            # NOTE(alan): We shouldn't do OCR with chipper
-            # NOTE(antonio): We shouldn't do PDFMiner with chipper
-            final_document_layout = inferred_document_layout
-        else:
-            extracted_layout = (
-                process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
-                if pdf_text_extractable
-                else []
-            )
+        extracted_layout = (
+            process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
+            if pdf_text_extractable
+            else []
+        )
 
-            if analysis:
-                if not analyzed_image_output_dir_path:
-                    if env_config.GLOBAL_WORKING_DIR_ENABLED:
-                        analyzed_image_output_dir_path = str(
-                            Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
-                        )
-                    else:
-                        analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
-                os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
-                if not skip_analysis_dump:
-                    od_model_layout_dumper = ObjectDetectionLayoutDumper(
-                        layout=inferred_document_layout,
-                        model_name=hi_res_model_name,
-                    )
-                    extracted_layout_dumper = ExtractedLayoutDumper(
-                        layout=extracted_layout,
+        if analysis:
+            if not analyzed_image_output_dir_path:
+                if env_config.GLOBAL_WORKING_DIR_ENABLED:
+                    analyzed_image_output_dir_path = str(
+                        Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
                     )
-                    ocr_layout_dumper = OCRLayoutDumper()
-            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-            merged_document_layout = merge_inferred_with_extracted_layout(
-                inferred_document_layout=inferred_document_layout,
-                extracted_layout=extracted_layout,
-                hi_res_model_name=hi_res_model_name,
-            )
+                else:
+                    analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
+            os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
+            if not skip_analysis_dump:
+                od_model_layout_dumper = ObjectDetectionLayoutDumper(
+                    layout=inferred_document_layout,
+                    model_name=hi_res_model_name,
+                )
+                extracted_layout_dumper = ExtractedLayoutDumper(
+                    layout=extracted_layout,
+                )
+                ocr_layout_dumper = OCRLayoutDumper()
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = merge_inferred_with_extracted_layout(
+            inferred_document_layout=inferred_document_layout,
+            extracted_layout=extracted_layout,
+            hi_res_model_name=hi_res_model_name,
+        )
 
-            final_document_layout = process_file_with_ocr(
-                filename,
-                merged_document_layout,
-                extracted_layout=extracted_layout,
-                is_image=is_image,
-                infer_table_structure=infer_table_structure,
-                ocr_languages=ocr_languages,
-                ocr_mode=ocr_mode,
-                pdf_image_dpi=pdf_image_dpi,
-                ocr_layout_dumper=ocr_layout_dumper,
-            )
+        final_document_layout = process_file_with_ocr(
+            filename,
+            merged_document_layout,
+            extracted_layout=extracted_layout,
+            is_image=is_image,
+            infer_table_structure=infer_table_structure,
+            ocr_languages=ocr_languages,
+            ocr_mode=ocr_mode,
+            pdf_image_dpi=pdf_image_dpi,
+            ocr_layout_dumper=ocr_layout_dumper,
+        )
     else:
         inferred_document_layout = process_data_with_model(
             file,
@@ -643,62 +633,51 @@ def _partition_pdf_or_image_local(
             pdf_image_dpi=pdf_image_dpi,
         )
 
-        if hi_res_model_name.startswith("chipper"):
-            # NOTE(alan): We shouldn't do OCR with chipper
-            # NOTE(antonio): We shouldn't do PDFMiner with chipper
-            final_document_layout = inferred_document_layout
-        else:
-            if hasattr(file, "seek"):
-                file.seek(0)
+        if hasattr(file, "seek"):
+            file.seek(0)
 
-            extracted_layout = (
-                process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
-                if pdf_text_extractable
-                else []
-            )
+        extracted_layout = (
+            process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
+        )
 
-            if analysis:
-                if not analyzed_image_output_dir_path:
-                    if env_config.GLOBAL_WORKING_DIR_ENABLED:
-                        analyzed_image_output_dir_path = str(
-                            Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
-                        )
-                    else:
-                        analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
-                if not skip_analysis_dump:
-                    od_model_layout_dumper = ObjectDetectionLayoutDumper(
-                        layout=inferred_document_layout,
-                        model_name=hi_res_model_name,
+        if analysis:
+            if not analyzed_image_output_dir_path:
+                if env_config.GLOBAL_WORKING_DIR_ENABLED:
+                    analyzed_image_output_dir_path = str(
+                        Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
                     )
-                    extracted_layout_dumper = ExtractedLayoutDumper(
-                        layout=extracted_layout,
-                    )
-                    ocr_layout_dumper = OCRLayoutDumper()
-
-            # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
-            merged_document_layout = merge_inferred_with_extracted_layout(
-                inferred_document_layout=inferred_document_layout,
-                extracted_layout=extracted_layout,
-                hi_res_model_name=hi_res_model_name,
-            )
+                else:
+                    analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
+            if not skip_analysis_dump:
+                od_model_layout_dumper = ObjectDetectionLayoutDumper(
+                    layout=inferred_document_layout,
+                    model_name=hi_res_model_name,
+                )
+                extracted_layout_dumper = ExtractedLayoutDumper(
+                    layout=extracted_layout,
+                )
+                ocr_layout_dumper = OCRLayoutDumper()
 
-            if hasattr(file, "seek"):
-                file.seek(0)
-            final_document_layout = process_data_with_ocr(
-                file,
-                merged_document_layout,
-                extracted_layout=extracted_layout,
-                is_image=is_image,
-                infer_table_structure=infer_table_structure,
-                ocr_languages=ocr_languages,
-                ocr_mode=ocr_mode,
-                pdf_image_dpi=pdf_image_dpi,
-                ocr_layout_dumper=ocr_layout_dumper,
-            )
+        # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
+        merged_document_layout = merge_inferred_with_extracted_layout(
+            inferred_document_layout=inferred_document_layout,
+            extracted_layout=extracted_layout,
+            hi_res_model_name=hi_res_model_name,
+        )
 
-    # NOTE(alan): starting with v2, chipper sorts the elements itself.
-    if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
-        kwargs["sort_mode"] = SORT_MODE_DONT
+        if hasattr(file, "seek"):
+            file.seek(0)
+        final_document_layout = process_data_with_ocr(
+            file,
+            merged_document_layout,
+            extracted_layout=extracted_layout,
+            is_image=is_image,
+            infer_table_structure=infer_table_structure,
+            ocr_languages=ocr_languages,
+            ocr_mode=ocr_mode,
+            pdf_image_dpi=pdf_image_dpi,
+            ocr_layout_dumper=ocr_layout_dumper,
+        )
 
     final_document_layout = clean_pdfminer_inner_elements(final_document_layout)
 
@@ -766,9 +745,7 @@ def _partition_pdf_or_image_local(
                 " ",
                 el.text or "",
             ).strip()
-            # NOTE(alan): with chipper there are parent elements with no text we don't want to
-            # filter those out and leave the children orphaned.
-            if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
+            if el.text or isinstance(el, PageBreak):
                 out_elements.append(cast(Element, el))
 
     if extract_forms:

diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -176,10 +176,6 @@ def merge_inferred_with_extracted_layout(
     )
     from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
 
-    # If the model is a chipper model, we don't want to order the
-    # elements, as they are already ordered
-    order_elements = not hi_res_model_name.startswith("chipper")
-
     inferred_pages = inferred_document_layout.pages
     for i, (inferred_page, extracted_page_layout) in enumerate(
         zip(inferred_pages, extracted_layout)
@@ -206,10 +202,7 @@ def merge_inferred_with_extracted_layout(
             **threshold_kwargs,
         )
 
-        if order_elements:
-            merged_layout = sort_text_regions(
-                cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC
-            )
+        merged_layout = sort_text_regions(cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC)
 
         elements = []
         for layout_el in merged_layout:
Original file line number	Diff line number	Diff line change
@@ -1 +1 @@
-__version__ = "0.16.1-dev0"  # pragma: no cover
+__version__ = "0.16.1-dev1"  # pragma: no cover