diff --git a/CHANGELOG.md b/CHANGELOG.md index 959f6c581b..d13d859802 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.12-dev2 +## 0.16.12-dev3 ### Enhancements @@ -9,6 +9,7 @@ ### Fixes - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. +- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file. ## 0.16.11 diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 933882f9e2..c1f7ad1f8d 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -14,15 +14,14 @@ LogCaptureFixture, Mock, example_doc_path, - function_mock, patch, property_mock, ) from unstructured.file_utils.filetype import ( _FileTypeDetectionContext, - _OleFileDifferentiator, + _OleFileDetector, _TextFileDifferentiator, - _ZipFileDifferentiator, + _ZipFileDetector, detect_filetype, is_json_processable, ) @@ -31,7 +30,41 @@ is_in_docker = os.path.exists("/.dockerenv") # ================================================================================================ -# STRATEGY #1 - CONTENT-TYPE ASSERTED IN CALL +# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES) +# ================================================================================================ + + +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + (FileType.DOC, "simple.doc"), + (FileType.DOCX, "simple.docx"), + (FileType.EPUB, "winter-sports.epub"), + (FileType.ODT, "simple.odt"), + (FileType.PPT, "fake-power-point.ppt"), + (FileType.PPTX, "fake-power-point.pptx"), + (FileType.XLS, "tests-example.xls"), + (FileType.XLSX, "stanley-cups.xlsx"), + ], +) +def 
test_it_detects_correct_file_type_for_CFB_and_ZIP_subtypes_detected_by_direct_inspection( + file_name: str, expected_value: FileType, ctx_mime_type_: Mock +): + # -- disable other strategies; no content-type, guessed MIME-type or extension -- + ctx_mime_type_.return_value = None + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) + + file_type = detect_filetype(file=file) + + # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not + # -- fall back to MIME-type guessing for any of these test cases. + ctx_mime_type_.assert_not_called() + assert file_type == expected_value + + +# ================================================================================================ +# STRATEGY #2 - CONTENT-TYPE ASSERTED IN CALL # ================================================================================================ @@ -40,41 +73,21 @@ [ (FileType.BMP, "img/bmp_24.bmp", "image/bmp"), (FileType.CSV, "stanley-cups.csv", "text/csv"), - (FileType.DOC, "simple.doc", "application/msword"), - ( - FileType.DOCX, - "simple.docx", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - ), (FileType.EML, "eml/fake-email.eml", "message/rfc822"), - (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), (FileType.HEIC, "img/DA-1p.heic", "image/heic"), (FileType.HTML, "example-10k-1p.html", "text/html"), (FileType.JPG, "img/example.jpg", "image/jpeg"), (FileType.JSON, "spring-weather.html.json", "application/json"), (FileType.MD, "README.md", "text/markdown"), - (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), (FileType.ORG, "README.org", "text/org"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), (FileType.PNG, "img/DA-1p.png", "image/png"), - (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), - ( - FileType.PPTX, - "fake-power-point.pptx", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - 
), (FileType.RST, "README.rst", "text/x-rst"), (FileType.RTF, "fake-doc.rtf", "text/rtf"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), (FileType.TSV, "stanley-cups.tsv", "text/tsv"), (FileType.TXT, "norwich-city.txt", "text/plain"), (FileType.WAV, "CantinaBand3.wav", "audio/wav"), - (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), - ( - FileType.XLSX, - "stanley-cups.xlsx", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ), (FileType.XML, "factbook.xml", "application/xml"), (FileType.ZIP, "simple.zip", "application/zip"), ], @@ -82,13 +95,13 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_content_type( file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock ): - # -- disable strategy #2, leaving only asserted content-type and extension -- + # -- disable mime-guessing leaving only asserted content-type and extension -- ctx_mime_type_.return_value = None file_type = detect_filetype(example_doc_path(file_name), content_type=content_type) - # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not - # -- fall back to strategy 2 for any of these test cases. + # -- Content-type strategy should not need to refer to guessed MIME-type and detection should + # -- not fall back to MIME-type guessing for any of these test cases.
ctx_mime_type_.assert_not_called() assert file_type == expected_value @@ -98,41 +111,21 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte [ (FileType.BMP, "img/bmp_24.bmp", "image/bmp"), (FileType.CSV, "stanley-cups.csv", "text/csv"), - (FileType.DOC, "simple.doc", "application/msword"), - ( - FileType.DOCX, - "simple.docx", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - ), (FileType.EML, "eml/fake-email.eml", "message/rfc822"), - (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), (FileType.HEIC, "img/DA-1p.heic", "image/heic"), (FileType.HTML, "example-10k-1p.html", "text/html"), (FileType.JPG, "img/example.jpg", "image/jpeg"), (FileType.JSON, "spring-weather.html.json", "application/json"), (FileType.MD, "README.md", "text/markdown"), - (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), (FileType.ORG, "README.org", "text/org"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), (FileType.PNG, "img/DA-1p.png", "image/png"), - (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), - ( - FileType.PPTX, - "fake-power-point.pptx", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - ), (FileType.RST, "README.rst", "text/x-rst"), (FileType.RTF, "fake-doc.rtf", "text/rtf"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"), (FileType.TSV, "stanley-cups.tsv", "text/tsv"), (FileType.TXT, "norwich-city.txt", "text/plain"), (FileType.WAV, "CantinaBand3.wav", "audio/wav"), - (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), - ( - FileType.XLSX, - "stanley-cups.xlsx", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ), (FileType.XML, "factbook.xml", "application/xml"), (FileType.ZIP, "simple.zip", "application/zip"), ], @@ -140,93 +133,22 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte def 
test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_content_type( file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock ): - # -- disable strategy #2 (guessed mime-type) -- - ctx_mime_type_.return_value = None - # -- disable strategy #3 (filename extension) by supplying no source of file name -- - with open(example_doc_path(file_name), "rb") as f: - file = io.BytesIO(f.read()) - - file_type = detect_filetype(file=file, content_type=content_type) - - # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not - # -- fall-back to strategy 2 for any of these test cases. - ctx_mime_type_.assert_not_called() - assert file_type is expected_value - - -@pytest.mark.parametrize( - ("expected_value", "file_name"), - [ - (FileType.DOCX, "simple.docx"), - (FileType.PPTX, "fake-power-point.pptx"), - (FileType.XLSX, "stanley-cups.xlsx"), - ], -) -@pytest.mark.parametrize( - "content_type", - [ - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ], -) -def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_content_type( - file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock -): - # -- disable strategies 2 & 3, content-type strategy should get this on its own -- + # -- disable mime-guessing -- ctx_mime_type_.return_value = None + # -- disable filename extension mapping by supplying no source of file name -- with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) file_type = detect_filetype(file=file, content_type=content_type) - # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not - # -- fall-back to strategy 2 for any of these test cases. 
- ctx_mime_type_.assert_not_called() - assert file_type is expected_value - - -@pytest.mark.parametrize( - ("expected_value", "file_name"), - [ - (FileType.DOC, "simple.doc"), - (FileType.PPT, "fake-power-point.ppt"), - (FileType.XLS, "tests-example.xls"), - ], -) -@pytest.mark.parametrize( - "content_type", - [ - "application/msword", - "application/vnd.ms-outlook", - "application/vnd.ms-powerpoint", - "application/vnd.ms-excel", - "anything/else", - ], -) -def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_content_type( - file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock -): - """Fixes wrong XLS asserted as DOC, PPT, etc. - - Asserted content-type can be anything except `None` and differentiator will fix it if the file - is DOC, PPT, or XLS type. - """ - # -- disable strategies 2 & 3, content-type strategy should get this on its own -- - ctx_mime_type_.return_value = None - with open(example_doc_path(file_name), "rb") as f: - file = io.BytesIO(f.read()) - - file_type = detect_filetype(file=file, content_type=content_type) - - # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not - # -- fall-back to strategy 2 for any of these test cases. + # -- Content-type strategy should not need to refer to guessed MIME-type and detection should + # -- not fall back to MIME-type guessing for any of these test cases.
ctx_mime_type_.assert_not_called() assert file_type is expected_value # ================================================================================================ -# STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC +# STRATEGY #3 - GUESS MIME-TYPE WITH LIBMAGIC/FILETYPE LIBRARY # ================================================================================================ @@ -237,31 +159,16 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_ (FileType.CSV, "stanley-cups.csv", "text/csv"), (FileType.CSV, "stanley-cups.csv", "application/csv"), (FileType.CSV, "stanley-cups.csv", "application/x-csv"), - (FileType.DOC, "simple.doc", "application/msword"), - ( - FileType.DOCX, - "simple.docx", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - ), (FileType.EML, "eml/fake-email.eml", "message/rfc822"), - (FileType.EPUB, "winter-sports.epub", "application/epub"), - (FileType.EPUB, "winter-sports.epub", "application/epub+zip"), (FileType.HEIC, "img/DA-1p.heic", "image/heic"), (FileType.HTML, "example-10k-1p.html", "text/html"), (FileType.JPG, "img/example.jpg", "image/jpeg"), (FileType.JSON, "spring-weather.html.json", "application/json"), (FileType.MD, "README.md", "text/markdown"), (FileType.MD, "README.md", "text/x-markdown"), - (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"), (FileType.ORG, "README.org", "text/org"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"), (FileType.PNG, "img/DA-1p.png", "image/png"), - (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"), - ( - FileType.PPTX, - "fake-power-point.pptx", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - ), (FileType.RST, "README.rst", "text/x-rst"), (FileType.RTF, "fake-doc.rtf", "text/rtf"), (FileType.RTF, "fake-doc.rtf", "application/rtf"), @@ -270,18 +177,11 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_ 
(FileType.TXT, "norwich-city.txt", "text/plain"), (FileType.TXT, "simple.yaml", "text/yaml"), (FileType.WAV, "CantinaBand3.wav", "audio/wav"), - (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"), - ( - FileType.XLSX, - "stanley-cups.xlsx", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ), (FileType.XML, "factbook.xml", "application/xml"), (FileType.XML, "factbook.xml", "text/xml"), - (FileType.ZIP, "simple.zip", "application/zip"), ], ) -def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_recognized_mime_type( +def test_it_detects_correct_file_type_by_guessed_MIME_when_libmagic_guesses_recognized_mime_type( file_name: str, mime_type: str, expected_value: FileType, ctx_mime_type_: Mock ): # -- libmagic guesses a MIME-type mapped to a `FileType` -- @@ -290,7 +190,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) - # -- disable strategy #1 by not asserting a content_type in the call -- + # -- disable content-type strategy by not asserting a content_type in the call -- file_type = detect_filetype(file=file) # -- ctx.mime_type may be referenced multiple times, but at least once -- @@ -303,30 +203,22 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec [ (FileType.BMP, "img/bmp_24.bmp"), (FileType.CSV, "stanley-cups.csv"), - (FileType.DOC, "simple.doc"), - (FileType.DOCX, "simple.docx"), (FileType.EML, "eml/fake-email.eml"), - (FileType.EPUB, "winter-sports.epub"), (FileType.HEIC, "img/DA-1p.heic"), (FileType.HTML, "ideas-page.html"), (FileType.JPG, "img/example.jpg"), (FileType.JSON, "spring-weather.html.json"), - (FileType.ODT, "simple.odt"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), (FileType.PNG, "img/DA-1p.png"), - (FileType.PPT, "fake-power-point.ppt"), - (FileType.PPTX, "fake-power-point.pptx"), (FileType.RTF, "fake-doc.rtf"), (FileType.TIFF, 
"img/layout-parser-paper-fast.tiff"), (FileType.TXT, "norwich-city.txt"), (FileType.WAV, "CantinaBand3.wav"), - (FileType.XLS, "tests-example.xls"), - (FileType.XLSX, "stanley-cups.xlsx"), (FileType.XML, "factbook.xml"), (FileType.ZIP, "simple.zip"), ], ) -def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_type_for_itself( +def test_it_detects_most_file_types_using_mime_guessing_when_libmagic_guesses_mime_type_for_itself( file_name: str, expected_value: FileType ): """Does not work for all types, in particular: @@ -339,90 +231,26 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_ - ORG is identified as TXT - RST is identified as TXT """ - # -- disable strategy #1 by not asserting a content_type in the call -- - # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute -- + # -- disable content-type strategy by not asserting a content_type in the call -- + # -- disable extension-mapping strategy by passing file-like object with no `.name` attribute -- with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) assert detect_filetype(file=file) is expected_value -@pytest.mark.parametrize( - ("expected_value", "file_name"), - [ - (FileType.DOC, "simple.doc"), - (FileType.PPT, "fake-power-point.ppt"), - (FileType.XLS, "tests-example.xls"), - ], -) -@pytest.mark.parametrize( - "guessed_mime_type", - [ - "application/msword", - "application/vnd.ms-excel", - "application/vnd.ms-outlook", - "application/vnd.ms-powerpoint", - "application/x-ole-storage", - "anything/else", - ], -) -def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_guessed_mime_type( - file_name: str, guessed_mime_type: str, expected_value: FileType, ctx_mime_type_: Mock -): - """Fixes XLS wrongly-guessed as DOC, PPT, "application/x-ole-storage" etc. 
- - It's better than that actually, the OLE differentiator will get the right file-type for any DOC, - PPT, XLS, or MSG file, regardless of guessed MIME-type. - """ - ctx_mime_type_.return_value = guessed_mime_type - # -- disable strategy 3 by not providing a file-name source -- - with open(example_doc_path(file_name), "rb") as f: - file = io.BytesIO(f.read()) - - # -- disable strategy 1 by not asserting a content-type -- - file_type = detect_filetype(file=file) - - ctx_mime_type_.assert_called_with() - assert file_type is expected_value - - -@pytest.mark.parametrize( - ("filename", "mime_type", "expected"), - [ - ("fake.doc", "application/vnd.ms-excel", FileType.DOC), - ("fake-power-point.ppt", "application/vnd.ms-excel", FileType.PPT), - ("tests-example.xls", "application/msword", FileType.XLS), - ("fake-email.msg", "application/vnd.ms-excel", FileType.MSG), - ], -) -def test_ole_file_structure_trusted_over_mime_type_guess(filename, mime_type, expected): - def _guess_mime(*args, **kwargs): - return mime_type - - with patch("filetype.guess_mime", _guess_mime): - detect_filetype(example_doc_path(filename)) == expected - - @pytest.mark.parametrize( ("expected_value", "file_name"), [ # -- `filetype` lib recognizes all these binary file-types -- (FileType.BMP, "img/bmp_24.bmp"), - (FileType.DOC, "simple.doc"), - (FileType.DOCX, "simple.docx"), - (FileType.EPUB, "winter-sports.epub"), (FileType.HEIC, "img/DA-1p.heic"), (FileType.JPG, "img/example.jpg"), - (FileType.ODT, "simple.odt"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), (FileType.PNG, "img/DA-1p.png"), - (FileType.PPT, "fake-power-point.ppt"), - (FileType.PPTX, "fake-power-point.pptx"), (FileType.RTF, "fake-doc.rtf"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), (FileType.WAV, "CantinaBand3.wav"), - (FileType.XLS, "tests-example.xls"), - (FileType.XLSX, "stanley-cups.xlsx"), (FileType.ZIP, "simple.zip"), # -- but it doesn't recognize textual file-types at all -- (FileType.UNK, 
"stanley-cups.csv"), @@ -435,11 +263,9 @@ def _guess_mime(*args, **kwargs): (FileType.UNK, "stanley-cups.tsv"), (FileType.UNK, "norwich-city.txt"), (FileType.UNK, "factbook.xml"), - # -- and it doesn't recognize MSG files -- - (FileType.UNK, "fake-email.msg"), ], ) -def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailable( +def test_strategy_mime_guessing_can_detect_only_binary_file_types_when_libmagic_is_unavailable( file_name: str, expected_value: FileType, LIBMAGIC_AVAILABLE_False: bool ): """File-type is detected using `filetype` library when libmagic is not available. @@ -447,7 +273,7 @@ def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailab `filetype.guess_mime()` does a good job on binary file types (PDF, images, legacy MS-Office), but doesn't even try to guess textual file-types. """ - # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute -- + # -- disable detection by extension by passing file-like object with no `.name` attribute -- with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) # -- simulate libmagic is not available -- @@ -470,7 +296,7 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed( # ================================================================================================ -# STRATEGY #3 - MAP FILENAME EXTENSION TO FILETYPE +# STRATEGY #4 - MAP FILENAME EXTENSION TO FILETYPE # ================================================================================================ @@ -479,35 +305,25 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed( [ (FileType.BMP, "img/bmp_24.bmp"), (FileType.CSV, "stanley-cups.csv"), - (FileType.DOC, "simple.doc"), - (FileType.DOCX, "simple.docx"), (FileType.EML, "eml/fake-email.eml"), - (FileType.EPUB, "winter-sports.epub"), (FileType.HEIC, "img/DA-1p.heic"), (FileType.HTML, "example-10k-1p.html"), (FileType.JPG, "img/example.jpg"), 
(FileType.JSON, "spring-weather.html.json"), (FileType.MD, "README.md"), - (FileType.MSG, "fake-email.msg"), - (FileType.ODT, "simple.odt"), (FileType.ORG, "README.org"), (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"), (FileType.PNG, "img/DA-1p.png"), - (FileType.PPT, "fake-power-point.ppt"), - (FileType.PPTX, "fake-power-point.pptx"), (FileType.RST, "README.rst"), (FileType.RTF, "fake-doc.rtf"), (FileType.TIFF, "img/layout-parser-paper-fast.tiff"), (FileType.TSV, "stanley-cups.tsv"), (FileType.TXT, "norwich-city.txt"), (FileType.WAV, "CantinaBand3.wav"), - (FileType.XLS, "tests-example.xls"), - (FileType.XLSX, "stanley-cups.xlsx"), (FileType.XML, "factbook.xml"), - (FileType.ZIP, "simple.zip"), ], ) -def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_file_type( +def test_it_detects_correct_file_type_from_extension_when_that_maps_to_a_file_type( file_name: str, expected_value: FileType, ctx_mime_type_: Mock ): # -- disable strategy #2 by making libmagic always guess `None` -- @@ -525,10 +341,8 @@ def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_fil @pytest.mark.parametrize( ("expected_value", "file_name", "mime_type"), [ - (FileType.BMP, "img/bmp_24.bmp", "application/zip"), - (FileType.DOC, "simple.doc", None), - (FileType.EPUB, "winter-sports.epub", "application/x-ole-storage"), - (FileType.MSG, "fake-email.msg", "application/octet-stream"), + (FileType.BMP, "img/bmp_24.bmp", "application/octet-stream"), + (FileType.HEIC, "img/DA-1p.heic", "application/octet-stream"), ], ) def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail( @@ -547,6 +361,12 @@ def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail( # ================================================================================================ +@pytest.mark.parametrize("mime_type", [FileType.XLS.mime_type, FileType.XLSX.mime_type]) +def test_it_ignores_asserted_XLS_content_type_when_file_is_CSV(mime_type: str): + 
file_path = example_doc_path("stanley-cups.csv") + assert detect_filetype(file_path, content_type=mime_type) == FileType.CSV + + @pytest.mark.parametrize("mime_type", ["application/xml", "text/xml"]) @pytest.mark.parametrize("extension", [".html", ".htm"]) def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extension( @@ -563,39 +383,6 @@ def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extensi assert file_type is FileType.HTML -@pytest.mark.parametrize( - "mime_type", - [ - "application/octet-stream", - "application/zip", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ], -) -@pytest.mark.parametrize( - ("expected_value", "file_name"), - [ - (FileType.DOCX, "simple.docx"), - (FileType.PPTX, "fake-power-point.pptx"), - (FileType.XLSX, "stanley-cups.xlsx"), - (FileType.ZIP, "simple.zip"), - ], -) -def test_it_differentiates_files_when_libmagic_guesses_octet_stream_zip_or_modern_ms_office( - mime_type: str, file_name: str, expected_value: FileType, ctx_mime_type_: Mock -): - ctx_mime_type_.return_value = mime_type - # -- disable extension-based strategy #3 -- - with open(example_doc_path(file_name), "rb") as f: - file = io.BytesIO(f.read()) - - file_type = detect_filetype(file=file) - - ctx_mime_type_.assert_called_with() - assert file_type is expected_value - - @pytest.mark.parametrize( ("mime_type", "file_name"), [ @@ -1000,29 +787,8 @@ def mime_type_prop_(self, request: FixtureRequest): return property_mock(request, _FileTypeDetectionContext, "mime_type") -class Describe_OleFileDifferentiator: - """Unit-test suite for `unstructured.file_utils.filetype._OleFileDifferentiator`.""" - - # -- .applies() --------------------------------------------- - - def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self): - 
"""The constructor determines whether this differentiator is applicable. - - It returns an instance only when differentiating a CFBF file-type is required, which it - judges by inspecting the initial bytes of the file for the CFBF magic-bytes. - """ - ctx = _FileTypeDetectionContext(example_doc_path("simple.doc")) - - differentiator = _OleFileDifferentiator.applies(ctx, "foo/bar") - - assert differentiator is not None - assert isinstance(differentiator, _OleFileDifferentiator) - - def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_type(self): - ctx = _FileTypeDetectionContext(example_doc_path("winter-sports.epub")) - assert _OleFileDifferentiator.applies(ctx, "application/epub") is None - - # -- .file_type --------------------------------------------- +class Describe_OleFileDetector: + """Unit-test suite for `unstructured.file_utils.filetype._OleFileDetector`.""" @pytest.mark.parametrize( ("file_name", "expected_value"), @@ -1034,59 +800,15 @@ def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_t ("README.org", None), ], ) - def it_distinguishes_the_file_type_of_applicable_OLE_files( + def it_distinguishes_the_file_type_of_applicable_CFB_files( self, file_name: str, expected_value: FileType | None ): # -- no file-name available, just to make sure we're not relying on an extension -- with open(example_doc_path(file_name), "rb") as f: file = io.BytesIO(f.read()) ctx = _FileTypeDetectionContext(file=file) - differentiator = _OleFileDifferentiator(ctx) - assert differentiator.file_type is expected_value - - @pytest.mark.parametrize( - ("file_name", "expected_value"), - [ - ("simple.doc", FileType.DOC), - ("fake-power-point.ppt", FileType.PPT), - ("tests-example.xls", FileType.XLS), - ("fake-email.msg", FileType.MSG), - ], - ) - def it_distinguishes_the_file_type_of_applicable_OLE_files_from_storage_content( - self, file_name: str, expected_value: FileType | None - ): - # -- no file-name available, just to make sure 
we're not relying on an extension -- - with open(example_doc_path(file_name), "rb") as f: - file = io.BytesIO(f.read()) - ctx = _FileTypeDetectionContext(file=file) - differentiator = _OleFileDifferentiator(ctx) - - assert differentiator._check_ole_file_type(ctx) is expected_value - - def but_it_returns_None_to_engage_fallback_when_filetype_cannot_guess_mime( - self, guess_mime_: Mock - ): - guess_mime_.return_value = None - # -- no file-name available, just to make sure we're not relying on an extension -- - with open(example_doc_path("fake-email.msg"), "rb") as f: - file = io.BytesIO(f.read()) - ctx = _FileTypeDetectionContext(file=file) - differentiator = _OleFileDifferentiator(ctx) - # -- force method to return None to trigger the mime type being guessed - differentiator._check_ole_file_type = lambda ctx: None - - file_type = differentiator.file_type - - guess_mime_.assert_called_once_with(file) - assert file_type is None - - # -- fixtures -------------------------------------------------------------------------------- - - @pytest.fixture - def guess_mime_(self, request: FixtureRequest): - return function_mock(request, "unstructured.file_utils.filetype.ft.guess_mime") + assert _OleFileDetector.file_type(ctx) is expected_value class Describe_TextFileDifferentiator: @@ -1164,33 +886,15 @@ def it_distinguishes_a_JSON_file_from_other_text_files( assert differentiator._is_json is expected_value -class Describe_ZipFileDifferentiator: - """Unit-test suite for `unstructured.file_utils.filetype._ZipFileDifferentiator`.""" - - # -- .applies() --------------------------------------------- - - def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self): - """The constructor determines whether this differentiator is applicable. - - It returns an instance only when differentiating a zip file-type is required, which it can - judge from the mime-type provided by the context (`ctx`). 
- """ - ctx = _FileTypeDetectionContext(example_doc_path("simple.docx")) - - differentiator = _ZipFileDifferentiator.applies(ctx, "application/zip") - - assert isinstance(differentiator, _ZipFileDifferentiator) - - def and_it_returns_None_when_zip_differentiation_does_not_apply_to_the_detection_context(self): - ctx = _FileTypeDetectionContext(example_doc_path("norwich-city.txt")) - assert _ZipFileDifferentiator.applies(ctx, "application/epub") is None - - # -- .file_type --------------------------------------------- +class Describe_ZipFileDetector: + """Unit-test suite for `unstructured.file_utils.filetype._ZipFileDetector`.""" @pytest.mark.parametrize( ("file_name", "expected_value"), [ ("simple.docx", FileType.DOCX), + ("winter-sports.epub", FileType.EPUB), + ("simple.odt", FileType.ODT), ("picture.pptx", FileType.PPTX), ("vodafone.xlsx", FileType.XLSX), ("simple.zip", FileType.ZIP), @@ -1201,6 +905,4 @@ def it_distinguishes_the_file_type_of_applicable_zip_files( self, file_name: str, expected_value: FileType | None ): ctx = _FileTypeDetectionContext(example_doc_path(file_name)) - differentiator = _ZipFileDifferentiator(ctx) - - assert differentiator.file_type is expected_value + assert _ZipFileDetector.file_type(ctx) is expected_value diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 0dbfa1eb73..d1e3d3bd18 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.12-dev2" # pragma: no cover +__version__ = "0.16.12-dev3" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index d109cd7384..4c8e4d2be8 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -51,7 +51,11 @@ from unstructured.partition.common.metadata import set_element_hierarchy from unstructured.utils import get_call_args_applying_defaults, lazyproperty -LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic")) +try: + 
importlib.import_module("magic") + LIBMAGIC_AVAILABLE = True +except ImportError: + LIBMAGIC_AVAILABLE = False # pyright: ignore[reportConstantRedefinition] def detect_filetype( @@ -133,43 +137,57 @@ def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType: @property def _file_type(self) -> FileType: """FileType member corresponding to this document source.""" - # -- strategy 1: use content-type asserted by caller -- + # -- An explicit content-type is most commonly asserted by the client/SDK and is therefore + # -- inherently unreliable. On the other hand, binary file-types can be detected with 100% + # -- accuracy. So start with binary types and only then consider an asserted content-type, + # -- generally as a last resort. + + # -- strategy 1: most binary types can be detected with 100% accuracy -- + if file_type := self._known_binary_file_type: + return file_type + + # -- strategy 2: use content-type asserted by caller -- if file_type := self._file_type_from_content_type: return file_type - # -- strategy 2: guess MIME-type using libmagic and use that -- + # -- strategy 3: guess MIME-type using libmagic and use that -- if file_type := self._file_type_from_guessed_mime_type: return file_type - # -- strategy 3: use filename-extension, like ".docx" -> FileType.DOCX -- + # -- strategy 4: use filename-extension, like ".docx" -> FileType.DOCX -- if file_type := self._file_type_from_file_extension: return file_type - # -- strategy 4: give up and report FileType.UNK -- + # -- strategy 5: give up and report FileType.UNK -- return FileType.UNK # == STRATEGIES ============================================================ + @property + def _known_binary_file_type(self) -> FileType | None: + """Detect file-type for binary types we can positively detect.""" + if file_type := _OleFileDetector.file_type(self._ctx): + return file_type + + self._ctx.rule_out_cfb_content_types() + + if file_type := _ZipFileDetector.file_type(self._ctx): + return file_type + +
self._ctx.rule_out_zip_content_types() + + return None + @property def _file_type_from_content_type(self) -> FileType | None: """Map passed content-type argument to a file-type, subject to certain rules.""" - content_type = self._ctx.content_type # -- when no content-type was asserted by caller, this strategy is not applicable -- - if not content_type: + if not self._ctx.content_type: return None - # -- OLE-based file-format content_type values are sometimes unreliable. These are - # -- DOC, PPT, XLS, and MSG. - if differentiator := _OleFileDifferentiator.applies(self._ctx, content_type): - return differentiator.file_type - - # -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable -- - if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type): - return differentiator.file_type - # -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it -- - return FileType.from_mime_type(content_type) + return FileType.from_mime_type(self._ctx.content_type) @property def _file_type_from_guessed_mime_type(self) -> FileType | None: @@ -188,24 +206,12 @@ def _file_type_from_guessed_mime_type(self) -> FileType | None: if mime_type is None: return None - if differentiator := _OleFileDifferentiator.applies(self._ctx, mime_type): - return differentiator.file_type - if mime_type.endswith("xml"): return FileType.HTML if extension in (".html", ".htm") else FileType.XML if differentiator := _TextFileDifferentiator.applies(self._ctx): return differentiator.file_type - # -- applicable to "application/octet-stream", "application/zip", and all Office 2007+ - # -- document MIME-types, i.e. those for DOCX, PPTX, and XLSX. Note however it does NOT - # -- apply to EPUB or ODT documents, even though those are also Zip archives. The zip and - # -- octet-stream MIME-types are fed in because they are ambiguous. 
The MS-Office types are - # -- differentiated because they are sometimes mistaken for each other, like DOCX mime-type - # -- is actually a PPTX file etc. - if differentiator := _ZipFileDifferentiator.applies(self._ctx, mime_type): - return differentiator.file_type - # -- All source-code files (e.g. *.py, *.js) are classified as plain text for the moment -- if self._ctx.has_code_mime_type: return FileType.TXT @@ -214,14 +220,8 @@ def _file_type_from_guessed_mime_type(self) -> FileType | None: return FileType.EMPTY # -- if no more-specific rules apply, use the MIME-type -> FileType mapping when present -- - if file_type := FileType.from_mime_type(mime_type): - return file_type - - logger.warning( - f"The MIME type{f' of {self._ctx.file_path!r}' if self._ctx.file_path else ''} is" - f" {mime_type!r}. This file type is not currently supported in unstructured.", - ) - return None + file_type = FileType.from_mime_type(mime_type) + return file_type if file_type != FileType.UNK else None @lazyproperty def _file_type_from_file_extension(self) -> FileType | None: @@ -236,6 +236,9 @@ def _file_type_from_file_extension(self) -> FileType | None: class _FileTypeDetectionContext: """Provides all arguments to auto-file detection and values derived from them. + NOTE that `._content_type` is mutable via `.rule_out_*_content_types()` methods, so it should + not be assumed to be a constant value across those calls. + This keeps computation of derived values out of the file-detection code but more importantly allows the main filetype-detector to pass the full context to any delegates without coupling itself to which values it might need. @@ -276,7 +279,7 @@ def new( self._validate() return self - @lazyproperty + @property def content_type(self) -> str | None: """MIME-type asserted by caller; not based on inspection of file by this process. @@ -284,6 +287,8 @@ def content_type(self) -> str | None: present on the response. 
These are often ambiguous and sometimes just wrong so get some further verification. All lower-case when not `None`. """ + # -- Note `._content_type` is mutable via `.rule_out_*_content_types()` so this cannot be + # -- a `@lazyproperty`. return self._content_type.lower() if self._content_type else None @lazyproperty @@ -327,12 +332,6 @@ def file_path(self) -> str | None: return os.path.realpath(file_path) if os.path.islink(file_path) else file_path - @lazyproperty - def is_zipfile(self) -> bool: - """True when file is a Zip archive.""" - with self.open() as file: - return zipfile.is_zipfile(file) - @lazyproperty def has_code_mime_type(self) -> bool: """True when `mime_type` plausibly indicates a programming language source-code file.""" @@ -347,9 +346,27 @@ def has_code_mime_type(self) -> bool: return any( lang in mime_type - for lang in "c# c++ cpp csharp java javascript php python ruby swift typescript".split() + for lang in [ + "c#", + "c++", + "cpp", + "csharp", + "java", + "javascript", + "php", + "python", + "ruby", + "swift", + "typescript", + ] ) + @lazyproperty + def is_zipfile(self) -> bool: + """True when file is a Zip archive.""" + with self.open() as file: + return zipfile.is_zipfile(file) + @lazyproperty def mime_type(self) -> str | None: """The best MIME-type we can get from `magic` (or `filetype` package). @@ -401,6 +418,38 @@ def open(self) -> Iterator[IO[bytes]]: file.seek(0) yield file + def rule_out_cfb_content_types(self) -> None: + """Invalidate content-type when a legacy MS-Office file-type is asserted. + + Used before returning `None`; at that point we know the file is not one of these formats + so if the asserted `content-type` is a legacy MS-Office type we know it's wrong and should + not be used as a fallback later in the detection process.
+ """ + if FileType.from_mime_type(self._content_type) in ( + FileType.DOC, + FileType.MSG, + FileType.PPT, + FileType.XLS, + ): + self._content_type = None + + def rule_out_zip_content_types(self) -> None: + """Invalidate content-type when a Zip-based file-type is asserted. + + Used before returning `None`; at that point we know the file is not one of these formats + so if the asserted `content-type` is a Zip-based type we know it's wrong and should + not be used as a fallback later in the detection process. + """ + if FileType.from_mime_type(self._content_type) in ( + FileType.DOCX, + FileType.EPUB, + FileType.ODT, + FileType.PPTX, + FileType.XLSX, + FileType.ZIP, + ): + self._content_type = None + @lazyproperty def text_head(self) -> str: """The initial characters of the text file for use with text-format differentiation. @@ -440,27 +489,23 @@ def _validate(self) -> None: raise ValueError("either `file_path` or `file` argument must be provided") -class _OleFileDifferentiator: - """Refine an OLE-storage package (CFBF) file-type that may not be as specific as it could be. +class _OleFileDetector: + """Detect and differentiate a CFB file, aka. "OLE" file. - Compound File Binary Format (CFBF), aka. OLE file, is use by Microsoft for legacy MS Office - files (DOC, PPT, XLS) as well as for Outlook MSG files. `libmagic` tends to identify these as - `"application/x-ole-storage"` which is true but too not specific enough for partitioning - purposes. + Compound File Binary Format (CFB), aka. OLE file, is used by Microsoft for legacy MS Office + files (DOC, PPT, XLS) as well as for Outlook MSG files.
""" def __init__(self, ctx: _FileTypeDetectionContext): self._ctx = ctx @classmethod - def applies( - cls, ctx: _FileTypeDetectionContext, mime_type: str - ) -> _OleFileDifferentiator | None: - """Constructs an instance, but only if this differentiator applies for `mime_type`.""" - return cls(ctx) if cls._is_ole_file(ctx) else None + def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None: + """Specific file-type when file is a CFB file, `None` otherwise.""" + return cls(ctx)._file_type @property - def file_type(self) -> FileType | None: + def _file_type(self) -> FileType | None: """Differentiated file-type for Microsoft Compound File Binary Format (CFBF). Returns one of: @@ -468,34 +513,27 @@ def file_type(self) -> FileType | None: - `FileType.PPT` - `FileType.XLS` - `FileType.MSG` + - `None` when the file is not one of these. """ - # -- if this is not a CFBF file then whatever MIME-type was guessed is wrong, so return - # -- `None` to trigger fall-back to next strategy. - if not self._is_ole_file(self._ctx): + # -- all CFB files share common magic number, start with that -- + if not self._is_ole_file: return None - # -- check storage contents of the ole file for file type markers - if (ole_file_type := self._check_ole_file_type(self._ctx)) is not None: + # -- check storage contents of the ole file for file-type specific stream names -- + if (ole_file_type := self._ole_file_type) is not None: return ole_file_type - # -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so we rely on it - # -- to differentiate those. Note `filetype` doesn't detect MSG type and won't always - # -- detect DOC, PPT, or XLS, returning `None` instead. We let those fall through and we - # -- rely on filename-extension to identify those. 
- with self._ctx.open() as file: - mime_type = ft.guess_mime(file) - - return FileType.from_mime_type(mime_type) if mime_type else None + return None - @staticmethod - def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool: - """True when file has CFBF magic first 8 bytes.""" - with ctx.open() as file: + @lazyproperty + def _is_ole_file(self) -> bool: + """True when file has CFB magic first 8 bytes.""" + with self._ctx.open() as file: return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" - @staticmethod - def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None: - with ctx.open() as f: + @lazyproperty + def _ole_file_type(self) -> FileType | None: + with self._ctx.open() as f: ole = OleFileIO(f) # pyright: ignore[reportUnknownVariableType] root_storage = Storage.from_ole(ole) # pyright: ignore[reportUnknownMemberType] @@ -537,7 +575,20 @@ def file_type(self) -> FileType: """ extension = self._ctx.extension - if extension in ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .tsv".split(): + if extension in [ + ".csv", + ".eml", + ".html", + ".json", + ".markdown", + ".md", + ".org", + ".p7s", + ".rst", + ".rtf", + ".tab", + ".tsv", + ]: return FileType.from_extension(extension) or FileType.TXT # NOTE(crag): for older versions of the OS libmagic package, such as is currently @@ -616,40 +667,28 @@ def _is_json(self) -> bool: return False -class _ZipFileDifferentiator: - """Refine a Zip-packaged file-type that may be ambiguous or swapped.""" +class _ZipFileDetector: + """Detect and differentiate a Zip-archive file.""" def __init__(self, ctx: _FileTypeDetectionContext): self._ctx = ctx @classmethod - def applies( - cls, ctx: _FileTypeDetectionContext, mime_type: str - ) -> _ZipFileDifferentiator | None: - """Constructs an instance, but only if this differentiator applies for `mime_type`. + def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None: + """Most specific file-type available when file is a Zip file, `None` otherwise. 
- Separate `mime_type` argument allows it to be applied to either asserted content-type or - guessed mime-type. + MS-Office 2007+ files are detected with 100% accuracy. Archives with a root `mimetype` + member (e.g. ODT and EPUB) are identified from it. Any other Zip archive is reported as + `FileType.ZIP`. Returns `None` when the file is not a Zip archive. """ - return ( - cls(ctx) - if mime_type - in ( - "application/octet-stream", - "application/zip", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ) - else None - ) + return cls(ctx)._file_type @lazyproperty - def file_type(self) -> FileType | None: + def _file_type(self) -> FileType | None: """Differentiated file-type for a Zip archive. - Returns `None` if the file is not a Zip archive. Otherwise it returns `FileType.DOCX`, - `FileType.PPTX`, or `FileType.XLSX` when one of those applies and `FileType.ZIP` otherwise. + Returns `None` when the file is not a Zip archive. Otherwise the most specific type found: + DOCX, PPTX, or XLSX; the type named by a root `mimetype` member; else `FileType.ZIP`. """ if not self._ctx.is_zipfile: return None @@ -657,20 +696,23 @@ def file_type(self) -> FileType | None: with self._ctx.open() as file: zip = zipfile.ZipFile(file) - # NOTE(robinson) - .docx and .xlsx files are actually a zip file with a .docx/.xslx - # extension. If the MIME type is application/octet-stream, we check if it's a - # .docx/.xlsx file by looking for expected filenames within the zip file.
- filenames = [f.filename for f in zip.filelist] + filenames = zip.namelist() - if all(f in filenames for f in ("word/document.xml",)): + if "word/document.xml" in filenames: return FileType.DOCX - if all(f in filenames for f in ("xl/workbook.xml",)): + if "xl/workbook.xml" in filenames: return FileType.XLSX - if all(f in filenames for f in ("ppt/presentation.xml",)): + if "ppt/presentation.xml" in filenames: return FileType.PPTX + # -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root -- + if "mimetype" in filenames: + with zip.open("mimetype") as f: + mime_type = f.read().decode("utf-8").strip() + return FileType.from_mime_type(mime_type) + return FileType.ZIP