Skip to content

Commit

Permalink
feat(eml): EML, MSG process_attachments=True by default
Browse files (browse the repository at this point in the history)
Change the default of `process_attachments` from `False` to `True` in `partition_email()` and `partition_msg()`.
  • Loading branch information
scanny committed Oct 7, 2024
1 parent e376863 commit c7e1a0c
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 6 deletions.
3 changes: 2 additions & 1 deletion test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -1230,7 +1230,8 @@ def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(file_type
extensions = file_type._extensions
for file in pathlib.Path(doc_path).iterdir():
if file.is_file() and file.suffix in extensions:
elements = partition_fn(str(file))
# -- attachments get the MIME-type of the attachment, not the email they were in --
elements = partition_fn(str(file), process_attachments=False)
break

assert elements
Expand Down
13 changes: 10 additions & 3 deletions test_unstructured/partition/test_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def test_partition_email_partitions_an_html_part_with_quoted_printable_encoded_I
elements = partition_email(
example_doc_path("eml/email-no-utf8-2014-03-17.111517.eml"),
content_source="text/html",
process_attachments=False,
)

assert len(elements) == 1
Expand All @@ -135,15 +136,21 @@ def test_partition_email_accepts_a_whitespace_only_file():


def test_partition_email_can_partition_an_empty_email():
assert partition_email(example_doc_path("eml/mime-no-body.eml")) == []
assert (
partition_email(example_doc_path("eml/mime-no-body.eml"), process_attachments=False) == []
)


def test_partition_email_does_not_break_on_an_encrypted_message():
assert partition_email(example_doc_path("eml/fake-encrypted.eml")) == []
assert (
partition_email(example_doc_path("eml/fake-encrypted.eml"), process_attachments=False) == []
)


def test_partition_email_finds_content_when_it_is_marked_with_content_disposition_inline():
elements = partition_email(example_doc_path("eml/email-inline-content-disposition.eml"))
elements = partition_email(
example_doc_path("eml/email-inline-content-disposition.eml"), process_attachments=False
)

assert len(elements) == 1
e = elements[0]
Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def partition_email(
content_source: str = "text/html",
metadata_filename: str | None = None,
metadata_last_modified: str | None = None,
process_attachments: bool = False,
process_attachments: bool = True,
**kwargs: Any,
) -> list[Element]:
"""Partitions an .eml file into document elements.
Expand Down
2 changes: 1 addition & 1 deletion unstructured/partition/msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def partition_msg(
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
process_attachments: bool = False,
process_attachments: bool = True,
**kwargs: Any,
) -> list[Element]:
"""Partitions a MSFT Outlook .msg file
Expand Down

0 comments on commit c7e1a0c

Please sign in to comment.