Skip to content

Commit

Permalink
Remove unsupported chipper model (#3728)
Browse files (browse the repository at this point in the history)
The chipper model is no longer supported.
  • Loading branch information
vangheem authored Oct 17, 2024
1 parent 1eceac2 commit b092d45
Show file tree
Hide file tree
Showing 9 changed files with 88 additions and 202 deletions.
35 changes: 0 additions & 35 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,41 +153,6 @@ jobs:
make test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
make check-coverage
test_chipper:
strategy:
matrix:
python-version: ["3.10"]
runs-on: ubuntu-latest
env:
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Setup virtual environment
uses: ./.github/actions/base-cache
with:
python-version: ${{ matrix.python-version }}
- name: Test
env:
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
PYTHON: python${{ matrix.python-version }}
NLTK_DATA: ${{ github.workspace }}/nltk_data
run: |
source .venv/bin/activate
sudo apt-get update
sudo apt-get install -y poppler-utils
make install-pandoc install-test
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get update
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
make test-chipper CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
test_unit_no_extras:
strategy:
matrix:
Expand Down
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
## 0.16.1-dev0
## 0.16.1-dev1

### Enhancements

### Features

### Fixes

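* **Remove unsupported chipper model.** The chipper model is no longer supported, so its dedicated CI job, `make test-chipper` target, pytest marker, tests, and special-case code paths in PDF partitioning have been removed.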
* **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.

## 0.16.0
Expand Down
9 changes: 2 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,7 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
.PHONY: test
test:
PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "not chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40

.PHONY: test-chipper
test-chipper:
PYTHONPATH=. CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} -m "chipper" --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) ${PYTHON} -m pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40

.PHONY: test-unstructured-api-unit
test-unstructured-api-unit:
Expand Down Expand Up @@ -309,7 +304,7 @@ docker-test:
$(DOCKER_IMAGE) \
bash -c "CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
pytest -m 'not chipper' $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"

.PHONY: docker-smoke-test
docker-smoke-test:
Expand Down
2 changes: 0 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ filterwarnings =
ignore::DeprecationWarning
python_classes = Test Describe
python_functions = test_ it_ they_ but_ and_
markers =
chipper: mark a test as running chipper, which tends to be slow and compute-heavy.
testpaths =
test_unstructured
test_unstructured_ingest
Expand Down
43 changes: 0 additions & 43 deletions test_unstructured/partition/pdf_image/test_chipper.py

This file was deleted.

2 changes: 1 addition & 1 deletion test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def test_partition_pdf_with_model_name_env_var(
assert mock_process.call_args[1]["model_name"] == "checkbox"


@pytest.mark.parametrize("model_name", ["checkbox", "yolox", "chipper"])
@pytest.mark.parametrize("model_name", ["checkbox", "yolox"])
def test_partition_pdf_with_model_name(
monkeypatch,
model_name,
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.1-dev0" # pragma: no cover
__version__ = "0.16.1-dev1" # pragma: no cover
185 changes: 81 additions & 104 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,12 +566,7 @@ def _partition_pdf_or_image_local(

hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
if pdf_image_dpi is None:
pdf_image_dpi = 300 if hi_res_model_name.startswith("chipper") else 200
if (pdf_image_dpi < 300) and (hi_res_model_name.startswith("chipper")):
logger.warning(
"The Chipper model performs better when images are rendered with DPI >= 300 "
f"(currently {pdf_image_dpi}).",
)
pdf_image_dpi = 200

od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
Expand All @@ -588,53 +583,48 @@ def _partition_pdf_or_image_local(
pdf_image_dpi=pdf_image_dpi,
)

if hi_res_model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
# NOTE(antonio): We shouldn't do PDFMiner with chipper
final_document_layout = inferred_document_layout
else:
extracted_layout = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
if pdf_text_extractable
else []
)
extracted_layout = (
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
if pdf_text_extractable
else []
)

if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
if not skip_analysis_dump:
od_model_layout_dumper = ObjectDetectionLayoutDumper(
layout=inferred_document_layout,
model_name=hi_res_model_name,
)
extracted_layout_dumper = ExtractedLayoutDumper(
layout=extracted_layout,
if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
ocr_layout_dumper = OCRLayoutDumper()
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
hi_res_model_name=hi_res_model_name,
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
if not skip_analysis_dump:
od_model_layout_dumper = ObjectDetectionLayoutDumper(
layout=inferred_document_layout,
model_name=hi_res_model_name,
)
extracted_layout_dumper = ExtractedLayoutDumper(
layout=extracted_layout,
)
ocr_layout_dumper = OCRLayoutDumper()
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
hi_res_model_name=hi_res_model_name,
)

final_document_layout = process_file_with_ocr(
filename,
merged_document_layout,
extracted_layout=extracted_layout,
is_image=is_image,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
final_document_layout = process_file_with_ocr(
filename,
merged_document_layout,
extracted_layout=extracted_layout,
is_image=is_image,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
else:
inferred_document_layout = process_data_with_model(
file,
Expand All @@ -643,62 +633,51 @@ def _partition_pdf_or_image_local(
pdf_image_dpi=pdf_image_dpi,
)

if hi_res_model_name.startswith("chipper"):
# NOTE(alan): We shouldn't do OCR with chipper
# NOTE(antonio): We shouldn't do PDFMiner with chipper
final_document_layout = inferred_document_layout
else:
if hasattr(file, "seek"):
file.seek(0)
if hasattr(file, "seek"):
file.seek(0)

extracted_layout = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
if pdf_text_extractable
else []
)
extracted_layout = (
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else []
)

if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
if not skip_analysis_dump:
od_model_layout_dumper = ObjectDetectionLayoutDumper(
layout=inferred_document_layout,
model_name=hi_res_model_name,
if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
extracted_layout_dumper = ExtractedLayoutDumper(
layout=extracted_layout,
)
ocr_layout_dumper = OCRLayoutDumper()

# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
hi_res_model_name=hi_res_model_name,
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
if not skip_analysis_dump:
od_model_layout_dumper = ObjectDetectionLayoutDumper(
layout=inferred_document_layout,
model_name=hi_res_model_name,
)
extracted_layout_dumper = ExtractedLayoutDumper(
layout=extracted_layout,
)
ocr_layout_dumper = OCRLayoutDumper()

if hasattr(file, "seek"):
file.seek(0)
final_document_layout = process_data_with_ocr(
file,
merged_document_layout,
extracted_layout=extracted_layout,
is_image=is_image,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
hi_res_model_name=hi_res_model_name,
)

# NOTE(alan): starting with v2, chipper sorts the elements itself.
if hi_res_model_name.startswith("chipper") and hi_res_model_name != "chipperv1":
kwargs["sort_mode"] = SORT_MODE_DONT
if hasattr(file, "seek"):
file.seek(0)
final_document_layout = process_data_with_ocr(
file,
merged_document_layout,
extracted_layout=extracted_layout,
is_image=is_image,
infer_table_structure=infer_table_structure,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_layout_dumper=ocr_layout_dumper,
)

final_document_layout = clean_pdfminer_inner_elements(final_document_layout)

Expand Down Expand Up @@ -766,9 +745,7 @@ def _partition_pdf_or_image_local(
" ",
el.text or "",
).strip()
# NOTE(alan): with chipper there are parent elements with no text we don't want to
# filter those out and leave the children orphaned.
if el.text or isinstance(el, PageBreak) or hi_res_model_name.startswith("chipper"):
if el.text or isinstance(el, PageBreak):
out_elements.append(cast(Element, el))

if extract_forms:
Expand Down
9 changes: 1 addition & 8 deletions unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,6 @@ def merge_inferred_with_extracted_layout(
)
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel

# If the model is a chipper model, we don't want to order the
# elements, as they are already ordered
order_elements = not hi_res_model_name.startswith("chipper")

inferred_pages = inferred_document_layout.pages
for i, (inferred_page, extracted_page_layout) in enumerate(
zip(inferred_pages, extracted_layout)
Expand All @@ -206,10 +202,7 @@ def merge_inferred_with_extracted_layout(
**threshold_kwargs,
)

if order_elements:
merged_layout = sort_text_regions(
cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC
)
merged_layout = sort_text_regions(cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC)

elements = []
for layout_el in merged_layout:
Expand Down

0 comments on commit b092d45

Please sign in to comment.