From bfb1cabd2140bd518de5b63e746f99c06dce60bf Mon Sep 17 00:00:00 2001 From: S1M0N38 Date: Sat, 7 Sep 2024 12:30:34 +0200 Subject: [PATCH 1/2] fix: process attachments in partitioning nested emails (#3604) --- unstructured/partition/email.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 76a8729fc0..eddb7aa0d8 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -205,6 +205,8 @@ def extract_attachment_info( list_attachments: list[Any] = [] for part in message.walk(): + if part.is_multipart(): + continue if "content-disposition" in part: cdisp = part["content-disposition"].split(";") cdisp = [clean_extra_whitespace(item) for item in cdisp] From 33b81af07eceeae1f6648e067c0ca3e88333d230 Mon Sep 17 00:00:00 2001 From: S1M0N38 Date: Fri, 27 Sep 2024 20:38:13 +0200 Subject: [PATCH 2/2] fix(msg): add custom attachment partitioner to MsgPartitionerOptions - Added `attachment_partitioner` parameter to `partition_msg` function. - Updated `MsgPartitionerOptions` class to include `attachment_partitioner` attribute. - Modified `_AttachmentPartitioner` class to use the custom `attachment_partitioner` if provided. 
--- unstructured/partition/msg.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py index 4c9daa89c9..d20ed3ccac 100644 --- a/unstructured/partition/msg.py +++ b/unstructured/partition/msg.py @@ -4,7 +4,7 @@ import os import re import tempfile -from typing import IO, Any, Iterator, Optional +from typing import IO, Any, Iterator, Optional, Callable from oxmsg import Message from oxmsg.attachment import Attachment @@ -35,6 +35,7 @@ def partition_msg( metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, process_attachments: bool = False, + attachment_partitioner: Optional[Callable[..., list[Element]]] = None, **kwargs: Any, ) -> list[Element]: """Partitions a MSFT Outlook .msg file @@ -64,6 +65,7 @@ def partition_msg( metadata_file_path=metadata_filename, metadata_last_modified=metadata_last_modified, partition_attachments=process_attachments, + attachment_partitioner=attachment_partitioner, ) return list( @@ -87,6 +89,7 @@ def __init__( metadata_file_path: str | None, metadata_last_modified: str | None, partition_attachments: bool, + attachment_partitioner: Optional[Callable[..., list[Element]]], ): self._date_from_file_object = date_from_file_object self._file = file @@ -94,6 +97,7 @@ def __init__( self._metadata_file_path = metadata_file_path self._metadata_last_modified = metadata_last_modified self._partition_attachments = partition_attachments + self._attachment_partitioner = attachment_partitioner @lazyproperty def is_encrypted(self) -> bool: @@ -140,6 +144,11 @@ def partition_attachments(self) -> bool: """True when message attachments should also be partitioned.""" return self._partition_attachments + @lazyproperty + def attachment_partitioner(self) -> Optional[Callable[..., list[Element]]]: + """The function to use to partition attachments""" + return self._attachment_partitioner + @lazyproperty def partitioning_kwargs(self) -> 
dict[str, Any]: """Partitioning keyword-arguments to be passed along to attachment partitioner.""" @@ -276,7 +285,6 @@ def iter_elements( def _iter_elements(self) -> Iterator[Element]: """Partition the file in an `oxmsg.attachment.Attachment` into elements.""" - from unstructured.partition.auto import partition with tempfile.TemporaryDirectory() as tmp_dir_path: # -- save attachment as file in this temporary directory -- @@ -285,7 +293,7 @@ def _iter_elements(self) -> Iterator[Element]: f.write(self._file_bytes) # -- partition the attachment -- - for element in partition( + for element in self._opts.attachment_partitioner( detached_file_path, metadata_filename=self._attachment_file_name, metadata_last_modified=self._attachment_last_modified,