From 5994e31ff36f6bbd7820a83d65b1d482ea0763d0 Mon Sep 17 00:00:00 2001 From: Luke Boggs Date: Tue, 24 Sep 2024 21:09:16 -0500 Subject: [PATCH] Fix bug causing partition_xlsx to raise error --- test_unstructured/partition/test_xlsx.py | 13 +++++++++++++ unstructured/partition/xlsx.py | 7 ++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/test_unstructured/partition/test_xlsx.py b/test_unstructured/partition/test_xlsx.py index ee16e74af6..082edfbb3c 100644 --- a/test_unstructured/partition/test_xlsx.py +++ b/test_unstructured/partition/test_xlsx.py @@ -76,6 +76,19 @@ def test_partition_xlsx_from_filename_no_subtables(): ), ] +def test_partition_xlsx_from_filename_no_subtables_no_infer_table_structure(): + """Partition to a single `Table` element per worksheet.""" + assert partition_xlsx("example-docs/stanley-cups.xlsx", find_subtable=False, infer_table_structure=False) == [ + Table( + "\n\n\nStanley Cups\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\n" + "Flyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n" + ), + Table( + "\n\n\nStanley Cups Since 67\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n" + "1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n" + ), + ] + def test_partition_xlsx_from_filename_no_subtables_no_metadata(): elements = partition_xlsx( diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index 1e4d23bf2a..db39da3ee6 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -106,15 +106,12 @@ def partition_xlsx( if not opts.find_subtable: html_text = ( sheet.to_html(index=False, header=opts.include_header, na_rep="") - if opts.infer_table_structure - else None ) - # XXX: `html_text` can be `None`. What happens on this call in that case? - text = soupparser_fromstring(html_text).text_content() # type: ignore + text = soupparser_fromstring(html_text).text_content() if opts.include_metadata: metadata = ElementMetadata( - text_as_html=html_text, + text_as_html=html_text if opts.infer_table_structure else None, page_name=sheet_name, page_number=page_number, filename=opts.metadata_file_path,