diff --git a/CHANGELOG.md b/CHANGELOG.md index 658f4d23f4..e1a1a0c610 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ * **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions. * **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimeters). * **Accommodate `image/jpg` in PPTX as alias for `image/jpeg`.** Resolves problem partitioning PPTX files having an invalid `image/jpg` (should be `image/jpeg`) MIME-type in the `[Content_Types].xml` member of the PPTX Zip archive. +* **EML File Partitioning** EML parts with a Content-Disposition type of `inline` are now included in the content map when creating elements. * **Fixes an issue in Object Detection metrics** The issue was in preprocessing/validating the ground truth and predicted data for object detection metrics. ## 0.15.1 diff --git a/example-docs/eml/text-part-marked-inline.eml b/example-docs/eml/text-part-marked-inline.eml new file mode 100644 index 0000000000..c1a0e91a9e --- /dev/null +++ b/example-docs/eml/text-part-marked-inline.eml @@ -0,0 +1,47 @@ +msip_labels: + MSIP_Label_5b083577-197b-450c-831d-519cf3f56cd2_ActionId=e50e55f0-3ca8-4485-cd8a-f13d1558cae5; +Received: Thu, 1 Feb 2024 00:00:00 +1000 +From: Example User +To: "Example.User2@someorg.com" +Subject: Project Proposal +Date: Thu, 1 Feb 2024 00:00:00 +1000 +MIME-Version: 1.0 +Content-type: Multipart/mixed; charset=us-ascii; + boundary="B614747692556E4F8C3F55D8444354BC-2432FD0F_message_boundary" +Content-Description: Multipart message + + +--B614747692556E4F8C3F55D8444354BC-2432FD0F_message_boundary +Content-type: Multipart/related; charset=ISO-8859-1; + boundary="A32785A2178ABE448C898C485850D5DD-2432FD0F_message_boundary" +Content-Description: Multipart message + + +--A32785A2178ABE448C898C485850D5DD-2432FD0F_message_boundary +Content-type: Multipart/alternative; charset=ISO-8859-1; + 
boundary="EF8ECD0282019B449D4B1EBC186DDD07-2432FD0F_message_boundary" +Content-Description: Multipart message + + +--EF8ECD0282019B449D4B1EBC186DDD07-2432FD0F_message_boundary +Content-type: text/plain; charset=us-ascii +Content-Transfer-Encoding: Quoted-printable +Content-Disposition: inline +Content-Description: Message text + +Hi + +=20 + +Please find attached a project proposal. + +Please let us know if you have any questions or would like to discuss= + anything at this stage. + +=20 + +Kind regards + +User + +=20 \ No newline at end of file diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index d39576c241..8461d9966a 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -679,3 +679,13 @@ def test_partition_eml_respects_detect_language_per_element(): assert "eng" in langs assert "spa" in langs + +def test_partition_reads_message_part_with_inline_content_disposition(): + elements = partition_email( + example_doc_path("eml/text-part-marked-inline.eml"), process_attachments=False + ) + + assert len(elements) == 1 + e = elements[0] + assert e.text.startswith("Hi Please find attached a project proposal.") + assert e.text.endswith("Kind regards User ") diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 3370d7534a..7fadc079d3 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -387,9 +387,9 @@ def partition_email( is_encrypted = False content_map: dict[str, str] = {} for part in msg.walk(): - # NOTE(robinson) - content dispostiion is None for the content of the email itself. + # content disposition is None/inline for the content of the email itself. # Other dispositions include "attachment" for attachments - if part.get_content_disposition() is not None: + if part.get_content_disposition() not in (None, "inline"): continue content_type = part.get_content_type()