Fix partition_xlsx() for find_subtable=False with infer_table_structure=False.

In the `find_subtable=False` branch, `html_text` was None whenever
`infer_table_structure` was false, and that None was passed straight to
soupparser_fromstring() (masked by a `# type: ignore`). The sheet HTML is now
always rendered so the element text can be extracted from it;
`infer_table_structure` only decides whether that HTML is stored in
`ElementMetadata.text_as_html`.

Adds a test covering that argument combination.

NOTE(review): the leading whitespace of the hunk lines was reconstructed from
the code structure -- confirm against the target tree, or apply with
`git apply --ignore-whitespace`. The `index` blob ids are carried over from the
source patch; the post-image ids will differ after the formatting changes.

diff --git a/test_unstructured/partition/test_xlsx.py b/test_unstructured/partition/test_xlsx.py
index ee16e74af6..082edfbb3c 100644
--- a/test_unstructured/partition/test_xlsx.py
+++ b/test_unstructured/partition/test_xlsx.py
@@ -76,6 +76,22 @@ def test_partition_xlsx_from_filename_no_subtables():
         ),
     ]
 
+
+def test_partition_xlsx_from_filename_no_subtables_no_infer_table_structure():
+    """Partition to a single `Table` element per worksheet."""
+    assert partition_xlsx(
+        "example-docs/stanley-cups.xlsx", find_subtable=False, infer_table_structure=False
+    ) == [
+        Table(
+            "\n\n\nStanley Cups\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\n"
+            "Flyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
+        ),
+        Table(
+            "\n\n\nStanley Cups Since 67\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n"
+            "1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
+        ),
+    ]
+
 
 def test_partition_xlsx_from_filename_no_subtables_no_metadata():
     elements = partition_xlsx(
diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
index 1e4d23bf2a..db39da3ee6 100644
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@@ -106,15 +106,10 @@ def partition_xlsx(
         if not opts.find_subtable:
-            html_text = (
-                sheet.to_html(index=False, header=opts.include_header, na_rep="")
-                if opts.infer_table_structure
-                else None
-            )
-            # XXX: `html_text` can be `None`. What happens on this call in that case?
-            text = soupparser_fromstring(html_text).text_content()  # type: ignore
+            html_text = sheet.to_html(index=False, header=opts.include_header, na_rep="")
+            text = soupparser_fromstring(html_text).text_content()
 
             if opts.include_metadata:
                 metadata = ElementMetadata(
-                    text_as_html=html_text,
+                    text_as_html=html_text if opts.infer_table_structure else None,
                     page_name=sheet_name,
                     page_number=page_number,
                     filename=opts.metadata_file_path,