diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 3bc9a1ef1ca48..f2390f2305d72 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -406,6 +406,7 @@ I/O - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`) - Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`) +- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements(:issue:`45598`) Period ^^^^^^ diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 384813b6ec65d..856ce52a6d6b6 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -112,7 +112,11 @@ def get_sheet_data( table: list[list[Scalar | NaTType]] = [] for sheet_row in sheet_rows: - sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] + sheet_cells = [ + x + for x in sheet_row.childNodes + if hasattr(x, "qname") and x.qname in cell_names + ] empty_cells = 0 table_row: list[Scalar | NaTType] = [] @@ -243,5 +247,5 @@ def _get_cell_string_value(self, cell) -> str: # https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704 value.append(self._get_cell_string_value(fragment)) else: - value.append(str(fragment)) + value.append(str(fragment).strip("\n")) return "".join(value) diff --git a/pandas/tests/io/data/excel/test_newlines.ods b/pandas/tests/io/data/excel/test_newlines.ods new file mode 100644 index 0000000000000..262529800351c Binary files /dev/null and b/pandas/tests/io/data/excel/test_newlines.ods differ diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index ddc3c42710a61..25079b235d332 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -36,3 +36,15 @@ def test_read_writer_table(): result = pd.read_excel("writertable.odt", sheet_name="Table1", index_col=0) tm.assert_frame_equal(result, expected) + + +def test_read_newlines_between_xml_elements_table(): + # GH#45598 + expected = pd.DataFrame( + [[1.0, 4.0, 7], [np.nan, np.nan, 8], [3.0, 6.0, 9]], + columns=["Column 1", "Column 2", "Column 3"], + ) + + result = pd.read_excel("test_newlines.ods") + + tm.assert_frame_equal(result, expected)