diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 4b56b2bedd..f15f1e2aa0 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -1230,7 +1230,8 @@ def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(file_type extensions = file_type._extensions for file in pathlib.Path(doc_path).iterdir(): if file.is_file() and file.suffix in extensions: - elements = partition_fn(str(file)) + # -- attachments get the MIME-type of the attachment, not the email they were in -- + elements = partition_fn(str(file), process_attachments=False) break assert elements diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index 9c1be47600..eb34d499ca 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -119,6 +119,7 @@ def test_partition_email_partitions_an_html_part_with_quoted_printable_encoded_I elements = partition_email( example_doc_path("eml/email-no-utf8-2014-03-17.111517.eml"), content_source="text/html", + process_attachments=False, ) assert len(elements) == 1 @@ -135,15 +136,21 @@ def test_partition_email_accepts_a_whitespace_only_file(): def test_partition_email_can_partition_an_empty_email(): - assert partition_email(example_doc_path("eml/mime-no-body.eml")) == [] + assert ( + partition_email(example_doc_path("eml/mime-no-body.eml"), process_attachments=False) == [] + ) def test_partition_email_does_not_break_on_an_encrypted_message(): - assert partition_email(example_doc_path("eml/fake-encrypted.eml")) == [] + assert ( + partition_email(example_doc_path("eml/fake-encrypted.eml"), process_attachments=False) == [] + ) def test_partition_email_finds_content_when_it_is_marked_with_content_disposition_inline(): - elements = partition_email(example_doc_path("eml/email-inline-content-disposition.eml")) + elements = partition_email( + example_doc_path("eml/email-inline-content-disposition.eml"), process_attachments=False + ) assert len(elements) == 1 e = elements[0] diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 9b12ee66bc..d194e22d21 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -32,7 +32,7 @@ def partition_email( content_source: str = "text/html", metadata_filename: str | None = None, metadata_last_modified: str | None = None, - process_attachments: bool = False, + process_attachments: bool = True, **kwargs: Any, ) -> list[Element]: """Partitions an .eml file into document elements. diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py index fff7b818a2..7c43f4667e 100644 --- a/unstructured/partition/msg.py +++ b/unstructured/partition/msg.py @@ -24,7 +24,7 @@ def partition_msg( file: Optional[IO[bytes]] = None, metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, - process_attachments: bool = False, + process_attachments: bool = True, **kwargs: Any, ) -> list[Element]: """Partitions a MSFT Outlook .msg file