From 502b2a957a5e6e1263c2c7ced8ea74152ece0791 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 8 Nov 2024 10:47:12 +0100 Subject: [PATCH 01/15] Add draft of the Excel To Document converter --- docs/pydoc/config/converters_api.yml | 1 + haystack/components/converters/__init__.py | 2 + haystack/components/converters/xlsx.py | 170 ++++++++++++++++++ .../converters/test_xlsx_to_document.py | 84 +++++++++ test/test_files/xlsx/test.xlsx | Bin 0 -> 6073 bytes 5 files changed, 257 insertions(+) create mode 100644 haystack/components/converters/xlsx.py create mode 100644 test/components/converters/test_xlsx_to_document.py create mode 100644 test/test_files/xlsx/test.xlsx diff --git a/docs/pydoc/config/converters_api.yml b/docs/pydoc/config/converters_api.yml index 6c89138c98..69a35d4e5b 100644 --- a/docs/pydoc/config/converters_api.yml +++ b/docs/pydoc/config/converters_api.yml @@ -16,6 +16,7 @@ loaders: "pypdf", "tika", "txt", + "xlsx", ] ignore_when_discovered: ["__init__"] processors: diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py index 4561dd1e0a..2c7ed33505 100644 --- a/haystack/components/converters/__init__.py +++ b/haystack/components/converters/__init__.py @@ -15,6 +15,7 @@ from haystack.components.converters.pypdf import PyPDFToDocument from haystack.components.converters.tika import TikaDocumentConverter from haystack.components.converters.txt import TextFileToDocument +from haystack.components.converters.xlsx import XLSXToDocument __all__ = [ "TextFileToDocument", @@ -31,4 +32,5 @@ "PPTXToDocument", "CSVToDocument", "JSONConverter", + "XLSXToDocument", ] diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py new file mode 100644 index 0000000000..d613c92868 --- /dev/null +++ b/haystack/components/converters/xlsx.py @@ -0,0 +1,170 @@ +import io +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional, Tuple, Union + +import pandas as pd + +from haystack import Document, component, logging +from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata +from haystack.dataclasses import ByteStream +from haystack.lazy_imports import LazyImport + +logger = logging.getLogger(__name__) + +with LazyImport("Run 'pip install openpyxl'") as xlsx_import: + import openpyxl # pylint: disable=unused-import # the library is used but not directly referenced + + +@component +class XLSXToDocument: + """ + Converts XLSX files to Documents. + + By default, it reads all work sheets into CSV format. + + ### Usage example + + ```python + from haystack.components.converters.xlsx import XLSXToDocument + + converter = XLSXToDocument() + results = converter.run(sources=["sample.xlsx"], meta={"date_added": datetime.now().isoformat()}) + documents = results["documents"] + print(documents[0].content) + # 'col1,col2\now1,row1\nrow2row2\n' + ``` + """ + + def __init__( + self, + table_format: Literal["csv", "markdown"] = "csv", + sheet_name: str | int | List[Union[str, int]] | None = None, + read_excel_kwargs: Optional[Dict[str, Any]] = None, + table_format_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + Creates a XLSXToDocument component. + + :param table_format: The format to convert the Excel file to. + :param sheet_name: The name of the sheet to read. If None, all sheets are read. + :param read_excel_kwargs: Additional arguments to pass to `pandas.read_excel`. + See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html#pandas-read-excel + :param table_format_kwargs: Additional keyword arguments to pass to the table format function. + """ + xlsx_import.check() + self.table_format = table_format + self.sheet_name = sheet_name + self.read_excel_kwargs = read_excel_kwargs or {} + self.table_format_kwargs = table_format_kwargs or {} + + @component.output_types(documents=List[Document]) + def run( + self, + sources: List[Union[str, Path, ByteStream]], + meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, + ) -> Dict[str, List[Document]]: + """ + Converts a XLSX file to a Document. + + :param sources: + List of file paths or ByteStream objects. + :param meta: + Optional metadata to attach to the documents. + This value can be either a list of dictionaries or a single dictionary. + If it's a single dictionary, its content is added to the metadata of all produced documents. + If it's a list, the length of the list must match the number of sources, because the two lists will + be zipped. + If `sources` contains ByteStream objects, their `meta` will be added to the output documents. + :returns: + A dictionary with the following keys: + - `documents`: Created documents + """ + documents = [] + + meta_list = normalize_metadata(meta, sources_count=len(sources)) + + for source, metadata in zip(sources, meta_list, strict=False): + try: + bytestream = get_bytestream_from_source(source) + except Exception as e: + logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e) + continue + + try: + tables, tables_metadata = self._extract_tables(bytestream) + except Exception as e: + logger.warning( + "Could not read {source} and convert it to a Document, skipping. Error: {error}", + source=source, + error=e, + ) + continue + + # Loop over tables and create a Document for each table + for table, excel_metadata in zip(tables, tables_metadata): + merged_metadata = {**bytestream.meta, **metadata, **excel_metadata} + document = Document(content=table, meta=merged_metadata) + documents.append(document) + + return {"documents": documents} + + @staticmethod + def _generate_excel_column_names(n_cols: int) -> List[str]: + result = [] + for i in range(n_cols): + col_name = "" + num = i + while num >= 0: + col_name = chr(num % 26 + 65) + col_name + num = num // 26 - 1 + result.append(col_name) + return result + + def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict]]: + """ + Extract tables from a Excel file. + """ + resolved_read_excel_kwargs = { + **self.read_excel_kwargs, + "sheet_name": self.sheet_name, + "header": None, # Don't assign any pandas column labels + "engine": "openpyxl", # Use openpyxl as the engine to read the Excel file + } + dict_or_df = pd.read_excel(io=io.BytesIO(bytestream.data), **resolved_read_excel_kwargs) + if isinstance(dict_or_df, pd.DataFrame): + dict_or_df = {self.sheet_name: dict_or_df} + + # Drop all columns and rows that are completely empty + keep_index = True + out_header = False if self.table_format == "csv" else () + for key in dict_or_df: + df = dict_or_df[key] + # row starts at 1 + df.index = df.index + 1 + # column names are alphabet characters + header = self._generate_excel_column_names(df.shape[1]) + df.columns = header + out_header = True if self.table_format == "csv" else header + df = df.dropna(axis=1, how="all", ignore_index=False) + df = df.dropna(axis=0, how="all", ignore_index=False) + dict_or_df[key] = df + + tables = [] + metadata = [] + for key in dict_or_df: + if self.table_format == "csv": + resolved_kwargs = {"index": keep_index, "header": out_header, **self.table_format_kwargs} + tables.append(dict_or_df[key].to_csv(**resolved_kwargs)) + elif self.table_format == "markdown": + resolved_kwargs = { + "index": keep_index, + "headers": out_header, + "tablefmt": "pipe", # tablefmt 'plain', 'simple', 'grid', 'pipe', 'orgtbl', 'rst', 'mediawiki', + # 'latex', 'latex_raw', 'latex_booktabs', 'latex_longtable' and tsv + **self.table_format_kwargs, + } + tables.append(dict_or_df[key].to_markdown(**resolved_kwargs)) + else: + raise ValueError(f"Unsupported export format: {self.table_format}. Choose either 'csv' or 'markdown'.") + metadata.append({"xlsx": {"sheet_name": key}}) + return tables, metadata diff --git a/test/components/converters/test_xlsx_to_document.py b/test/components/converters/test_xlsx_to_document.py new file mode 100644 index 0000000000..afe7eae3c9 --- /dev/null +++ b/test/components/converters/test_xlsx_to_document.py @@ -0,0 +1,84 @@ +import logging + +import pytest + +from haystack.components.converters.xlsx import XLSXToDocument + + +class TestXLSXToDocument: + def test_init(self) -> None: + converter = XLSXToDocument() + assert converter.sheet_name is None + assert converter.read_excel_kwargs == {} + assert converter.table_format == "csv" + assert converter.table_format_kwargs == {} + + def test_run(self, test_files_path) -> None: + converter = XLSXToDocument() + paths = [test_files_path / "xlsx" / "test.xlsx"] + results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) + documents = results["documents"] + assert len(documents) == 2 + assert documents[0].content == ",A,B\n1,col_a,col_b\n2,1.5,test\n" + assert documents[0].meta == { + "date_added": "2022-01-01T00:00:00", + "file_path": str(test_files_path / "xlsx" / "test.xlsx"), + "xlsx": {"sheet_name": "Sheet1"}, + } + assert documents[1].content == ",A,B\n1,col_c,col_d\n2,True,\n" + assert documents[1].meta == { + "date_added": "2022-01-01T00:00:00", + "file_path": str(test_files_path / "xlsx" / "test.xlsx"), + "xlsx": {"sheet_name": "Sheet2"}, + } + + @pytest.mark.parametrize( + "sheet_name, expected_sheet_name, expected_content", + [ + ("Sheet1", "Sheet1", ",A,B\n1,col_a,col_b\n2,1.5,test\n"), + ("Sheet2", "Sheet2", ",A,B\n1,col_c,col_d\n2,True,\n"), + (0, 0, ",A,B\n1,col_a,col_b\n2,1.5,test\n"), + (1, 1, ",A,B\n1,col_c,col_d\n2,True,\n"), + ], + ) + def test_run_sheet_name( + self, sheet_name: int | str, expected_sheet_name: str, expected_content: str, test_files_path + ) -> None: + converter = XLSXToDocument(sheet_name=sheet_name) + paths = [test_files_path / "xlsx" / "test.xlsx"] + results = converter.run(sources=paths) + documents = results["documents"] + assert len(documents) == 1 + assert documents[0].content == expected_content + assert documents[0].meta == { + "file_path": str(test_files_path / "xlsx" / "test.xlsx"), + "xlsx": {"sheet_name": expected_sheet_name}, + } + + def test_run_with_read_excel_kwargs(self, test_files_path) -> None: + converter = XLSXToDocument(sheet_name="Sheet1", read_excel_kwargs={"skiprows": 1}) + paths = [test_files_path / "xlsx" / "test.xlsx"] + results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) + documents = results["documents"] + assert len(documents) == 1 + assert documents[0].content == ",A,B\n1,1.5,test\n" + assert documents[0].meta == { + "date_added": "2022-01-01T00:00:00", + "file_path": str(test_files_path / "xlsx" / "test.xlsx"), + "xlsx": {"sheet_name": "Sheet1"}, + } + + def test_run_error_wrong_file_type(self, caplog: pytest.LogCaptureFixture, test_files_path) -> None: + converter = XLSXToDocument() + sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"] + with caplog.at_level(logging.WARNING): + results = converter.run(sources=sources) + assert "sample_pdf_1.pdf and convert it" in caplog.text + assert results["documents"] == [] + + def test_run_error_non_existent_file(self, caplog: pytest.LogCaptureFixture) -> None: + converter = XLSXToDocument() + paths = ["non_existing_file.docx"] + with caplog.at_level(logging.WARNING): + converter.run(sources=paths) + assert "Could not read non_existing_file.docx" in caplog.text diff --git a/test/test_files/xlsx/test.xlsx b/test/test_files/xlsx/test.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..6e09797c6b3873254476bfa139a8d9fe1274d1e6 GIT binary patch literal 6073 zcmeHLcR1Vc+DB?{QL{=V)TpgijTTLfsy%BH#0W(zR;gJrN{rfjQ)=(lsI5wey&9XM zs%A@%=&o-VbAO(DeC|h08E}poi-3RtOI$!gAM2FS9X})N z`7E8y++c8Pmw!G8@FE=SQwEISt^6eTpMjsqw5MPObU-<3%_Z)r#%S zPL^?Xi&SxN+m{(nJ?pDrtuawk#wWlxhxy}TV`1R~u&~truT%>DKb8M2m2QsCb}lwh zsH+R#$;Z#AjOm7F^HT(+nNDHvPthqzM6<9cu_M`QI1=YS(Y@1K!yUC+u1RZEDK_S{ zfj`(BUQ~sKqVJ(t8RE+fi>Pw$`LX8am(-2LF71{Mzm`H4g9_PYxpguD6-#9YI$>j< z9D3tJvC{2B3>NDzSSa6R4++2DN$mxzjC$2F^y&E;38=~ug=4Mg9L)BSh>5UHzimLJ zjni|m^6Bk5&@28M7na83+;&3Q^SG1e-rE#0(rWV1QqigPvG+6%9416&+l7GzzP%X7 zDhGtB^44BMl={*uy=RQ&3sV`0_jDGMVb0zoQe9q?*eEiSo(UHE#yNS!%&Fp(Xq*o3 z-o7$qN6Dbz%)~y?^+mq06*R4)y);EwGSU!Y&u6w5Y|Z{vj}=j?{Lz2IK$O-=duCvf z)<=wSty>~7Z8LA^oBNhd+SL>*lf%Hagvy#by=k8i^2p4@tJ$>c!8AVZYXh;Q0Vp5tyxmaoh7F#_ywG;?bFrC4*p-CTo zi*DQ8!HGA*t{`TS7APU5|2R#Nkla?=cQ@rAByjr1HaRAfQR}| zv?`M9juGkT*V(`U?2S4-b^($Yu+!04Qya>fM&2l-6uAw=aPTaDRi)zCL#|^U{$n*c z#lcAm{H!L?z3^EU>Y%jr4b!qGA@8zjdmmhN(5`AIw6z{OH%k{7io1uZ*$}v={?@*D zv}JQ&Mw-cKhhKs6exF{6b_5C$a-0IZbz}2Ot6R`}!K|x~xk3hz^GnTFJUB|qjgy+5 z(YsJ$#k^KuGmJL3hrI@umBHiv_+mv+Vk3!6)!}B@UoUZfwykSyK{*9R1ibGvTPj4y zNZE}k^${ootHRwEjXBBnN0gDco_JXfZE^TIRJ}dFrCKK%b|VkChBK=?92SIT`#?&CXdo zbl)HV50xBDC@Lrw#k$qyrfFG!-;a9-&N{9{cl0reaB`90;=0z}Ov6|>uA!ya+;=pe zb(56-ty!Td+a_jF$9esw%H4Jov1G-&tCe2RP7oD!Y%%LICsl**vpGc>)kwz@xkI_# z40qr(9MX7M8h8uYwfSA4?4~fPfIQxo$V_J8ja{SG}$zWm@aGsF%&CkcQ+^raGwTXUowcAI78 zv+k7#fv-@fmx#X9;k%fjU6>PTo(7|7+cG_yh~Zq2Cbz-PAJN#Mz);o}|re zpuJMPB2>1S(=vi10y}y^=jq_3oM{Dun&8X|1q%r^NA?!Iq{k%>2o<)rvoX=5prDyE z>p@@rzU}hE9s5yad|>=SSn)?79OR`KQMUr^fcVg6%gWlFb_q9%69Y@ zd{0OoyG4H#z14)Xgf3j`_VwaY2mk~&EO6$sNf}d0!pUF7Gi^Vvh-rGq#UmgH_+Vx| zng4PXRFD;+p+x>A1XU`lOeL<;)+D$R^1v2l2)5PZ8*XX8uqrT9_dTU&E^X24t1ScFyS5N)$9uuj_wGq)#V9J{3~}G=0zXG%IfP|Z|y{ieD$Cq*7Pk^G~sL2 zE9P!xq7-m5x!hXapJOhbX>KNyheNNej*mN2ZntNgL+}_&lPd0h;dj*!Ql2?^s8P~! zdP2wLlxJ_5C_5gkBKfP${N&z>rqvDg&Lav^)akd{J>N5sZs7IQ3w_S-PhRCt#G($yf8wAP z9XHh3dtrq(L7tLuE+8kR(D8it(y*vsui`_kOmf9Ng7u}SAI)6qM-#k2x?EY2Q4&qK z{klkK(`unK62vEY=$moVZ#BZ)c;UR^6Gm~!1p}B2mvCHb1Hvbwl$f=uso9od@&wQ@Kxgw-dJoYs_`Vq@kEwbXQV#!LYi#p>|p2Bx{<4xBXC zBrXnC#d-?KMjiS^Mmp)4~?U?jP*%HtsPqJ#~ zJQ;Op-dIh-Kr16XR}Ft(W{0jhakxUwaH5=lw@cKu?6SIfX?J=HpW%t^B|A6;Ta|}V z+}#u}u-YYZFpWq$9`Xq``O8lvx2#_UP8c^1qQj0_1BQL5EfcPhxag_oce1SmFn$6r zE)VA(dx8JQeoFe6pRVrq&=Uu%v|9dEN>bY!kcI5!O!Mgcr#q3)-g9?6hhTT$h*W*s zW(u2}`~b0~E|4DSOZwPEZ{@VlZy;x!L(X7$vnI)pYWbY&7!%il#q@+%23USw0^Zz> zydyXee4t5>ww!a}gMzzeWN&_>3-;{rTgsU?1Ngfxsn75{u{i(``#)_YhDBes*QHKN zm>(PLbCQ8O*43iR3=jnAXoxQ~J%;8>;&Q~K0Iv?ar^*&YBJSJMSttKWz?1q9kWN@X zawrFf7C`CbT9(;>o8S96snOYgQETQ#Sv|FBGtkF|drY|fJ9y+Cx64e{@C4YV-vayD zgBf6#pxs03lK57&8G!KCWtJ3?1wprSD^#IjZyWGP%Zqp4`5KThCczeJedHOvmu90E z-;KuY`}gqdk==rQn0k9&DI*j}9NQt8x4B=yD|_pvbKh`DrdV)@tC2ah_broyg|0uj zf5JrpHfv~Zw;EtnseaK}|NCvXU4cpUCZR|5b4AhFQSElyQi`vW!}&pQntXV z-WQ={m#jro6Qi5*WK<4M;=Wv>yz#?vER_7Q%aZ?$2aLWlcXYHn8PZUba6f*!pimDUJ5L!0B%e_>pactwf>2c(+PvkJ}wuWhZoI!V4Nv<64G-=rMTFD?+VT(IVMQKEm zos!PX1s1`%zyC1pnc^HkR9tuHx)Se9H?QaDk%60po+pyu(Mzd^qA);%PibJ{jRP*8 zF43wmx!C<%E8O^!yKL77ki6)h~#RR_w2)|J!%JU>}bI?!O65GK&B;Nm8sD?lbrp^yE8a%atB4$Gs=v~&XleOclG1bu>9lf%<%Ixe#-pQXL2HSnougXZTu90ssq;^ zj}DioRL4+>CsB1(tB|u4mqE>yWfPC~c2Y!y@3X}7-quA+32b=VR8D!3G$GpJB{rz3 z^~rFI)>IY^aQVKiZA(3tAETf23?_oyeqK;Aq&i}UPOxl-AomT&#CEfZ-8?P zn`v5Dnk-STLkG`@*)_DhZ|fR;9=%!$6q%w<;g*Y4#tb-O8y8j4oy{sW!FP4dkfnOV zioY_#%DESJtD;McWDXkStQaJ3@x~G1C$>^|@F8j)49>zQt1<+B=d!%;>$5FkUugt+ml{Qua>)XJr7pYy0)efPW&`_f9qjx@l$tm@~>tr-05A;rZ(r7TNtp1 zcN(Rzh}8{vE|+tdwqxC|;^Y(Sj)%+2)HYL@`};~m+~y?nFIv+JI@`R<1@ya4LU0eCgoKz22L$KaY8|+{{ZxRAlcH}40V?P4ROg-+n zlF?j*@{!bQIv#=W39=nCSjvwb;rL?-$7sU4a*l9UDBRUV%iRg;VuVTCcCB$+9080b zY(ov3XG+It7k!BCXsLO1uUYnsCy)pwIE3>K(J6@E zEvIBI$8ng?E)PX{_nYAD7N-;F_&uOeu9h>eSblB!VoTvBQMFb%JLCt;Ic-t~mUYRo zfcLy~k70~7t#VZXxX4W>#APmP(3sd`9{i*`sn3IABib$8rZk?CG!7T~#e0RwcOVxX zaIPh-8Gi@C9pU#Hh!^8v1JlngUj)&9pgifPHJ?*IpZU3?y-Uzr~ z`n8mNaJ&AGkLMkU7PTS@cudo@@ai}Ha#F3~*GFI49`@GF-!Pf$7dgbk z@UO1<>7C3m_w=|Jsep|`jrHrc@bu*u=C<&6`*(MT|9ceW77rxf6TbGLH9h3gvGv9?k-up6hCo%zXSf7;?JU-9*ln9 zRy6)ulvCUKH4&Z#JS_k*x%1272~GgdPFH6EPAe;nPW)xqBqsp>Lp7d7INgge3hb9H z|Ap``J$82K>BfO+*1v53SkV4O>OWie*~O=8C#JXjvS5mn#s8+$oW(g^m@t+0m$gv- jR(H=XJe^U$b6-sLw^d9{85a+OPI~-uJFfUwP6+=83K5}# literal 0 HcmV?d00001 From 49e9c4f1b8a50baf6a0f4f2d059633c22289f12e Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 8 Nov 2024 10:51:13 +0100 Subject: [PATCH 02/15] Add license header --- haystack/components/converters/xlsx.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py index d613c92868..ea577f3a7f 100644 --- a/haystack/components/converters/xlsx.py +++ b/haystack/components/converters/xlsx.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + import io from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Tuple, Union From 6d89fd9044a1ab1ca51f9e9b96f2ec12f0667e41 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 8 Nov 2024 10:52:48 +0100 Subject: [PATCH 03/15] Add release note --- .../add-excel-to-document-converter-1920c9f9902ddf17.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 releasenotes/notes/add-excel-to-document-converter-1920c9f9902ddf17.yaml diff --git a/releasenotes/notes/add-excel-to-document-converter-1920c9f9902ddf17.yaml b/releasenotes/notes/add-excel-to-document-converter-1920c9f9902ddf17.yaml new file mode 100644 index 0000000000..7ae6ca0aca --- /dev/null +++ b/releasenotes/notes/add-excel-to-document-converter-1920c9f9902ddf17.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Add XLSXToDocument converter that loads an Excel file using Pandas + openpyxl and by default converts each sheet into a separate Document in a CSV format. From 341ad2cf18274422425b321938a05a160ad32a7c Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 8 Nov 2024 10:57:36 +0100 Subject: [PATCH 04/15] Use Union instead of pipe --- haystack/components/converters/xlsx.py | 2 +- test/components/converters/test_xlsx_to_document.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py index ea577f3a7f..ee2204d936 100644 --- a/haystack/components/converters/xlsx.py +++ b/haystack/components/converters/xlsx.py @@ -42,7 +42,7 @@ class XLSXToDocument: def __init__( self, table_format: Literal["csv", "markdown"] = "csv", - sheet_name: str | int | List[Union[str, int]] | None = None, + sheet_name: Union[str, int, List[Union[str, int]], None] = None, read_excel_kwargs: Optional[Dict[str, Any]] = None, table_format_kwargs: Optional[Dict[str, Any]] = None, ): diff --git a/test/components/converters/test_xlsx_to_document.py b/test/components/converters/test_xlsx_to_document.py index afe7eae3c9..8088458cd3 100644 --- a/test/components/converters/test_xlsx_to_document.py +++ b/test/components/converters/test_xlsx_to_document.py @@ -1,4 +1,5 @@ import logging +from typing import Union import pytest @@ -42,7 +43,7 @@ def test_run(self, test_files_path) -> None: ], ) def test_run_sheet_name( - self, sheet_name: int | str, expected_sheet_name: str, expected_content: str, test_files_path + self, sheet_name: Union[int, str], expected_sheet_name: str, expected_content: str, test_files_path ) -> None: converter = XLSXToDocument(sheet_name=sheet_name) paths = [test_files_path / "xlsx" / "test.xlsx"] From 9a2b60c168040e79a942d7637f9cd102e1182043 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 8 Nov 2024 11:44:59 +0100 Subject: [PATCH 05/15] Add openpyxl as additional dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 0367b672ed..17cea1e2d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,6 +106,7 @@ extra-dependencies = [ "python-pptx", # PPTXToDocument "python-docx", # DocxToDocument "jq", #JSONConverter + "openpyxl", # XLSXToDocument "nltk", # NLTKDocumentSplitter From 6a0072de08e42b880b9647767e984884170fc95e Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 8 Nov 2024 11:55:26 +0100 Subject: [PATCH 06/15] Fix zip issue --- haystack/components/converters/xlsx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py index ee2204d936..3d2d8f57a4 100644 --- a/haystack/components/converters/xlsx.py +++ b/haystack/components/converters/xlsx.py @@ -87,7 +87,7 @@ def run( meta_list = normalize_metadata(meta, sources_count=len(sources)) - for source, metadata in zip(sources, meta_list, strict=False): + for source, metadata in zip(sources, meta_list): try: bytestream = get_bytestream_from_source(source) except Exception as e: From a428d8c9fcbdc4554c22916cc1da8408873c7ac1 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Mon, 18 Nov 2024 11:03:28 +0100 Subject: [PATCH 07/15] few updates from Bijay --- haystack/components/converters/xlsx.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py index 3d2d8f57a4..9e9b72fd02 100644 --- a/haystack/components/converters/xlsx.py +++ b/haystack/components/converters/xlsx.py @@ -139,8 +139,7 @@ def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict] dict_or_df = {self.sheet_name: dict_or_df} # Drop all columns and rows that are completely empty - keep_index = True - out_header = False if self.table_format == "csv" else () + out_header = True if self.table_format == "csv" else () for key in dict_or_df: df = dict_or_df[key] # row starts at 1 @@ -148,20 +147,19 @@ def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict] # column names are alphabet characters header = self._generate_excel_column_names(df.shape[1]) df.columns = header - out_header = True if self.table_format == "csv" else header - df = df.dropna(axis=1, how="all", ignore_index=False) - df = df.dropna(axis=0, how="all", ignore_index=False) + if self.table_format == "markdown": + out_header = header dict_or_df[key] = df tables = [] metadata = [] for key in dict_or_df: if self.table_format == "csv": - resolved_kwargs = {"index": keep_index, "header": out_header, **self.table_format_kwargs} + resolved_kwargs = {"index": True, "header": out_header, **self.table_format_kwargs} tables.append(dict_or_df[key].to_csv(**resolved_kwargs)) elif self.table_format == "markdown": resolved_kwargs = { - "index": keep_index, + "index": True, "headers": out_header, "tablefmt": "pipe", # tablefmt 'plain', 'simple', 'grid', 'pipe', 'orgtbl', 'rst', 'mediawiki', # 'latex', 'latex_raw', 'latex_booktabs', 'latex_longtable' and tsv From d6f9050007f72ca9e1abaea06c7f835f4ad46ffa Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 11 Dec 2024 14:52:41 +0100 Subject: [PATCH 08/15] Update deps --- haystack/components/converters/xlsx.py | 25 ++++++++++++++----------- pyproject.toml | 3 ++- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py index 9e9b72fd02..37e0f38bb8 100644 --- a/haystack/components/converters/xlsx.py +++ b/haystack/components/converters/xlsx.py @@ -18,6 +18,9 @@ with LazyImport("Run 'pip install openpyxl'") as xlsx_import: import openpyxl # pylint: disable=unused-import # the library is used but not directly referenced +with LazyImport("Run 'pip install tabulate'") as tabulate_import: + from tabulate import tabulate # pylint: disable=unused-import # the library is used but not directly referenced + @component class XLSXToDocument: @@ -57,6 +60,10 @@ def __init__( """ xlsx_import.check() self.table_format = table_format + if table_format not in ["csv", "markdown"]: + raise ValueError(f"Unsupported export format: {table_format}. Choose either 'csv' or 'markdown'.") + if table_format == "markdown": + tabulate_import.check() self.sheet_name = sheet_name self.read_excel_kwargs = read_excel_kwargs or {} self.table_format_kwargs = table_format_kwargs or {} @@ -138,35 +145,31 @@ def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict] if isinstance(dict_or_df, pd.DataFrame): dict_or_df = {self.sheet_name: dict_or_df} - # Drop all columns and rows that are completely empty - out_header = True if self.table_format == "csv" else () for key in dict_or_df: df = dict_or_df[key] - # row starts at 1 + # Row starts at 1 in Excel df.index = df.index + 1 - # column names are alphabet characters + # Excel column names are Alphabet Characters header = self._generate_excel_column_names(df.shape[1]) df.columns = header - if self.table_format == "markdown": - out_header = header dict_or_df[key] = df tables = [] metadata = [] for key in dict_or_df: if self.table_format == "csv": - resolved_kwargs = {"index": True, "header": out_header, **self.table_format_kwargs} + resolved_kwargs = {"index": True, "header": True, **self.table_format_kwargs} tables.append(dict_or_df[key].to_csv(**resolved_kwargs)) - elif self.table_format == "markdown": + else: resolved_kwargs = { "index": True, - "headers": out_header, + "headers": "firstrow", "tablefmt": "pipe", # tablefmt 'plain', 'simple', 'grid', 'pipe', 'orgtbl', 'rst', 'mediawiki', # 'latex', 'latex_raw', 'latex_booktabs', 'latex_longtable' and tsv **self.table_format_kwargs, } + # to_markdown uses tabulate tables.append(dict_or_df[key].to_markdown(**resolved_kwargs)) - else: - raise ValueError(f"Unsupported export format: {self.table_format}. Choose either 'csv' or 'markdown'.") + # add sheet_name to metadata metadata.append({"xlsx": {"sheet_name": key}}) return tables, metadata diff --git a/pyproject.toml b/pyproject.toml index 0e8946c3b4..4ffffa9785 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,8 +105,9 @@ extra-dependencies = [ "trafilatura", # HTMLToDocument "python-pptx", # PPTXToDocument "python-docx", # DocxToDocument - "jq", #JSONConverter + "jq", # JSONConverter "openpyxl", # XLSXToDocument + "tabulate", # XLSXToDocument "nltk", # NLTKDocumentSplitter From 039aea4dfc880d26509fb4a7d9f11c70313d94ae Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 11 Dec 2024 14:57:02 +0100 Subject: [PATCH 09/15] Add markdown test --- haystack/components/converters/xlsx.py | 2 +- .../converters/test_xlsx_to_document.py | 25 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py index 37e0f38bb8..d36c86aba1 100644 --- a/haystack/components/converters/xlsx.py +++ b/haystack/components/converters/xlsx.py @@ -163,7 +163,7 @@ def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict] else: resolved_kwargs = { "index": True, - "headers": "firstrow", + "headers": dict_or_df[key].columns, "tablefmt": "pipe", # tablefmt 'plain', 'simple', 'grid', 'pipe', 'orgtbl', 'rst', 'mediawiki', # 'latex', 'latex_raw', 'latex_booktabs', 'latex_longtable' and tsv **self.table_format_kwargs, diff --git a/test/components/converters/test_xlsx_to_document.py b/test/components/converters/test_xlsx_to_document.py index 8088458cd3..ebd362b72a 100644 --- a/test/components/converters/test_xlsx_to_document.py +++ b/test/components/converters/test_xlsx_to_document.py @@ -33,6 +33,31 @@ def test_run(self, test_files_path) -> None: "xlsx": {"sheet_name": "Sheet2"}, } + def test_run_markdown(self, test_files_path) -> None: + converter = XLSXToDocument(table_format="markdown") + paths = [test_files_path / "xlsx" / "test.xlsx"] + results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) + documents = results["documents"] + assert len(documents) == 2 + assert ( + documents[0].content + == "| | A | B |\n|---:|:------|:------|\n| 1 | col_a | col_b |\n| 2 | 1.5 | test |" + ) + assert documents[0].meta == { + "date_added": "2022-01-01T00:00:00", + "file_path": str(test_files_path / "xlsx" / "test.xlsx"), + "xlsx": {"sheet_name": "Sheet1"}, + } + assert ( + documents[1].content + == "| | A | B |\n|---:|:------|:------|\n| 1 | col_c | col_d |\n| 2 | True | nan |" + ) + assert documents[1].meta == { + "date_added": "2022-01-01T00:00:00", + "file_path": str(test_files_path / "xlsx" / "test.xlsx"), + "xlsx": {"sheet_name": "Sheet2"}, + } + @pytest.mark.parametrize( "sheet_name, expected_sheet_name, expected_content", [ From 744e21ac4c969f4e0a587cdb3e7e11e472093ffc Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 11 Dec 2024 15:08:01 +0100 Subject: [PATCH 10/15] Adding more example excels and expanding tests --- .../converters/test_xlsx_to_document.py | 36 +++++++++--------- .../xlsx/basic_tables_two_sheets.xlsx | Bin 0 -> 11650 bytes test/test_files/xlsx/multiple-tables.xlsx | Bin 0 -> 4854 bytes .../xlsx/table_empty_rows_and_columns.xlsx | Bin 0 -> 4743 bytes test/test_files/xlsx/test.xlsx | Bin 6073 -> 0 bytes 5 files changed, 18 insertions(+), 18 deletions(-) create mode 100644 test/test_files/xlsx/basic_tables_two_sheets.xlsx create mode 100644 test/test_files/xlsx/multiple-tables.xlsx create mode 100644 test/test_files/xlsx/table_empty_rows_and_columns.xlsx delete mode 100644 test/test_files/xlsx/test.xlsx diff --git a/test/components/converters/test_xlsx_to_document.py b/test/components/converters/test_xlsx_to_document.py index ebd362b72a..784ae4c43f 100644 --- a/test/components/converters/test_xlsx_to_document.py +++ b/test/components/converters/test_xlsx_to_document.py @@ -16,26 +16,26 @@ def test_init(self) -> None: def test_run(self, test_files_path) -> None: converter = XLSXToDocument() - paths = [test_files_path / "xlsx" / "test.xlsx"] + paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"] results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) documents = results["documents"] assert len(documents) == 2 assert documents[0].content == ",A,B\n1,col_a,col_b\n2,1.5,test\n" assert documents[0].meta == { "date_added": "2022-01-01T00:00:00", - "file_path": str(test_files_path / "xlsx" / "test.xlsx"), - "xlsx": {"sheet_name": "Sheet1"}, + "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"), + "xlsx": {"sheet_name": "Basic Table"}, } assert documents[1].content == ",A,B\n1,col_c,col_d\n2,True,\n" assert documents[1].meta == { "date_added": "2022-01-01T00:00:00", - "file_path": str(test_files_path / "xlsx" / "test.xlsx"), - "xlsx": {"sheet_name": "Sheet2"}, + "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"), + "xlsx": {"sheet_name": "Table Missing Value"}, } def test_run_markdown(self, test_files_path) -> None: converter = XLSXToDocument(table_format="markdown") - paths = [test_files_path / "xlsx" / "test.xlsx"] + paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"] results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) documents = results["documents"] assert len(documents) == 2 @@ -45,8 +45,8 @@ def test_run_markdown(self, test_files_path) -> None: ) assert documents[0].meta == { "date_added": "2022-01-01T00:00:00", - "file_path": str(test_files_path / "xlsx" / "test.xlsx"), - "xlsx": {"sheet_name": "Sheet1"}, + "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"), + "xlsx": {"sheet_name": "Basic Table"}, } assert ( documents[1].content @@ -54,15 +54,15 @@ def test_run_markdown(self, test_files_path) -> None: ) assert documents[1].meta == { "date_added": "2022-01-01T00:00:00", - "file_path": str(test_files_path / "xlsx" / "test.xlsx"), - "xlsx": {"sheet_name": "Sheet2"}, + "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"), + "xlsx": {"sheet_name": "Table Missing Value"}, } @pytest.mark.parametrize( "sheet_name, expected_sheet_name, expected_content", [ - ("Sheet1", "Sheet1", ",A,B\n1,col_a,col_b\n2,1.5,test\n"), - ("Sheet2", "Sheet2", ",A,B\n1,col_c,col_d\n2,True,\n"), + ("Basic Table", "Basic Table", ",A,B\n1,col_a,col_b\n2,1.5,test\n"), + ("Table Missing Value", "Table Missing Value", ",A,B\n1,col_c,col_d\n2,True,\n"), (0, 0, ",A,B\n1,col_a,col_b\n2,1.5,test\n"), (1, 1, ",A,B\n1,col_c,col_d\n2,True,\n"), ], @@ -71,27 +71,27 @@ def test_run_sheet_name( self, sheet_name: Union[int, str], expected_sheet_name: str, expected_content: str, test_files_path ) -> None: converter = XLSXToDocument(sheet_name=sheet_name) - paths = [test_files_path / "xlsx" / "test.xlsx"] + paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"] results = converter.run(sources=paths) documents = results["documents"] assert len(documents) == 1 assert documents[0].content == expected_content assert documents[0].meta == { - "file_path": str(test_files_path / "xlsx" / "test.xlsx"), + "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"), "xlsx": {"sheet_name": expected_sheet_name}, } def test_run_with_read_excel_kwargs(self, test_files_path) -> None: - converter = XLSXToDocument(sheet_name="Sheet1", read_excel_kwargs={"skiprows": 1}) - paths = [test_files_path / "xlsx" / "test.xlsx"] + converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1}) + paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"] results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) documents = results["documents"] assert len(documents) == 1 assert documents[0].content == ",A,B\n1,1.5,test\n" assert documents[0].meta == { "date_added": "2022-01-01T00:00:00", - "file_path": str(test_files_path / "xlsx" / "test.xlsx"), - "xlsx": {"sheet_name": "Sheet1"}, + "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"), + "xlsx": {"sheet_name": "Basic Table"}, } def test_run_error_wrong_file_type(self, caplog: pytest.LogCaptureFixture, test_files_path) -> None: diff --git a/test/test_files/xlsx/basic_tables_two_sheets.xlsx b/test/test_files/xlsx/basic_tables_two_sheets.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..15254ddb8c99cedc74a0fec694ef411cc5a45c36 GIT binary patch literal 11650 zcmeI22|Sc*`^PO=LTKztIL49^gHTLDX(Um0VHjkm>|!WugP80~NQFdMBm2IEu^m~) z7;9x;$3EkaPA8}Lyyv{{dCzk%*F# zk|os4dfriZNUh#gE%5naQJT-)1xLoY(!$SY97ZX* zXrVHvE-ziAjQ6N}m&`XhRZSXCI<`zu>Gdl-OSLAKczjce+m9KxNw#)-I4gtV%?+^H zHgSWuk5~{%phUFbyXlkt5P7`JVJQMT4%2kA;(ZUa3B%Y(%%NI)x>(^}Q(!KiVMT|8&jfqwmJ)HmVZ;t{A+LGcenWE)DKpvATON zucUOqFnfN;@Cjj=Q`VCqoqwx#Y3`nTY56UKCxnuLWG|16nB~CPISk?U+w)J3QxDDx zU$>_XT+41pt!hhVwmIxrZFxDnZYN8kF}K~N2=6>R3cV|D31xHI4-6(v)IUqMjh-&; zZf7!_f6=homVoP-+M4ab-*$Fg*EcOHF*p%gi^XPP=e;uvH`X5cBWB~K6UcLvzkUIU3U^OB8pJH7K-Md)4mG2bnxkxw{uLTx&=dCVvC z_BF8AayD+tc&C1iKl#KMv=Hr@?6tnQ=_(2DUkz-m3WFPB=IPLFSncFeNIHLSrew<& zx)xhlv||Y6vqJG? zx}|=yJ^Xm*T1)NHl4lWIsvkVG{9~;}I`q+EUw^uyH@I*>wh(NH;mr1Wr~f1wzOJ#F zQSS;Zg0u^vLg8T7*o@k3jk(;8%+)~Dl6K<6&XDd#Hwu-6*0kx(F=%ulMUxdK^qmnZ z7)h4pgosMO5i~}CwwG>n`QHodYG79I5I2)JMY^uU%Evg&Q9DnVDY2FtTqe_~E z=;er$@z**{gZR|NnAnGX7e(X*^%| zdQ2r>w|HDyv$#r{rKmx~Wc;;W(-OY!vvDZzn4b*+d}88pDa|x0w=F~!BG|V1^S>DU z_!#1^S%{`I|8!EF&!)mB9+%O4P35+gsCqfyBAU{GGqD+k&e}D<@ZUYcqoWji1~>niLVfA6PpveJg5^v>W~AvFT##N zLP!zLTp&8&>VvwANF7Q5gB*+p5^@L;#RXyjy7<(oA$6z#>TI z2&qE@h>(McK|;t8JzSt;Ko{S-J4hW`K(`!B8WKW*z;S^%fvdiC_mMjE00w!O0wjbI z!Nd*X0lN6rA(1)&fVw#T2aV;Hc@GX+vZI(G;xXC z`C_KOV_d~iR1l4jouZfGf%*;y(|a=c2rDc&Q&!MF_(!V$<2U*Xe%nw{kG_?gRsXX1zv$cVRF6tb z$rICX33!iE>t_>`RxmSvIYaa38!H-p?DK|8Pp?Z*3-_5n5L-%Q_RJ*1T31WnGWz%2 z@tKm)3ykBwbg!NlBRhUQUX%3o>3_H-6GufRFrCRecii)6Q?9TY2ypU=;I zMoZ`Yb-p>d1fWnI{cPu@70k^)&(I_$rsRwLXBbP8c}6Id)|CKWk$%a%BY{d~!{Ucg zP3;j2q`ZB9w!_sZkv=wZ=~^Rm))^Y#MB^MW8kd0obYrdRo0aFM@H1Cs%eqvvSH@2D zvo%oD=8T7IQ!4*4?2=gi*qqjqJg8WDQ(VFtHA5s!!u{tv}x75(f|PZ&Yc=xj~^7dz1j zkDd+JOtiLp)bT@BL&mEdYhNTTaT%p!X<0A84FHsZ##xy12=E z$-7THP1$m9)Wn8ksgL$R$a!3oY_8v9p-GbokaN>uIS#Q2f}N^KC&tAz>j`lelpOjX zltjoiz^Vq(UN<<>zcS|EK|Eiq!v~^?1l}PGl-GuvGhZ@5WC!6G_G0k*<+MuvZVLr> zart*q`9KWz`R)rUW3YC|8ks1D`9DijmW0)B$TlJw#gF)WJVP138vocLI7fgTXm-vX zde!An`k5}(^4u4={(MpbGxB5YTt*ctfGnv=xH1V9o!8i zEY?acub6u43>gU97rABzcVo4-y6uH`90)m>ax^weD=@;uw-jGL**3C3!1ebNrf$F0 zorBt9v*$v0mT#A$Pt!GWy4ts|&bww#;JTF((D`LLEBS2>aM#U@+UdlVz6$sJ3O#sb zc?mXajy5@dMysf$_U`&t0&WXGkCEW4v?|ht&K7RXCe&tT&VF=EAESJO(&Eh4QYDS| zX6^WRRk1*KZU|Gk^RaIUf0wcHjeTz*HmLzO__4Nr51ft2#3tX|JVZ?Zp~wL5Om(!@52VZ$?6KS}oOwxz3kMY-(Y zn|UqS+3Txo31fjfXxB{i?5_M`1Kx9ZAMnN=WIr z=S&JlXQt0-qtW>8%H#yR*U)71ZTQ;c^pe~1iTUI}Vd-6h?aol;+4OSPF-)2-q~kA5 zcf0^KF6ho~a4YLG>4b)9Zlw>5b>bI42evqPt&Zf*ehQkec=NKI$-L1T3>E1nTV|Q{ zf?LH9JW7|+XHiMhf*;OiP3uVJ*ue~p4D>8UlCi{4MuNKol+lV$NkJ+8YL`y<$8ho3 z7r(}I`;1%ZZ-_|M3zxdh<7ImE7xW#gBihK?ci9f3zu@Uu z&440n-(|ZA{X2@n!$hbixvI-mN>wUVH>xzNbnhE*foOoy59-8`edK_#i!csID=7lb z1)>Ks`_x@R_E7?Onf>ctAo~~r&*fp7kX9;$0XOJ0FxtPa0NKY37?Xz? zKw7C0aBh$YkU5~P4%x>F;8B2?L0V}Le}u&F8>HU5V5iD;N0C6|BkMP;%DrwvNjUB! z`>^Vtro_Jx-kN?3Tp9liqWelsga+TrrQvy*vf)$;R1?%=VeQ0rJ%#sjH56|2hjH<% z_@9ZOf2;X#Cc3YD1vRN=(OuAF_Kfpq$;z){+qglLoWB4pzq;{v3HE=F z=zikK`0T0AE$25&LMw8fXV0wrq;j0&fev29su~W5%m8S(S0S)5ozts4^b7asrH7GrP`aTQ8P4SWXE)N zrwIIc{}`nHmvhZeOs9||c1Zfa^PsnFZy~jx?$5^rPE2gz>^5>LJUsF|$}1|QFnBk7 z#;9?r$C`%ohW#GY%pO0*9JUK~#D5&@_R;3oz^;(`sO7*TO3YPo74jNqA0Ia&Y-q@a zk@}22YP(f(+YJAN--Vx3(Hb}|L#n8#3ywFeNd?=kRmop^em5@wc$)F)8;OZHzK}^i zn&;G~x%nWhFeeIbA9jm|s@FUb;iyeE6Lz@p@mHr-fkCIyhj(l*z7^Gd&bzMoscN9U zQ?_unlGg~v^{p0?kU7I%AZjg%}??WIREwAn^KUsPsFF})2PjjkpN-TFVwYrZ)!je$I z>S61t{OmT58I)v9EKp3W3fYob$wo%!7c6dWZ2{Gxr)M5Fem{D48|S#=PW%8F>mU0u zBx{7>=z`VqBp};VUX&zKn&DJg03x!Uiut-%lI-BL&`_wOO#_N8eEf#BM?L>-j*qktP3zly{Wo*Yi8Mwkmh*&Ns`8@h+ zw)wuNAjFTyd}!~DqdZb2cCC~1$eIp$^MG*8C!(3@B3mJs3D z^2&g7Dz)XJQwSHUaDAYaj&N^v{qbqgaPfBH`*Gy3ypYK>6YPe1BMD=%THxX94%07= z=i9rx2~JNAo}@T^t8_xrB}!gb*L-;uDY0^onOgmRwh~mhMkni34aZTCE0n*c5S0o$ z_Y7t%n@+-}d4qy%IL~_BFqQHlkvbsS`PH^+57;9$J-q)2n(eUtWa8^Cj-w4O(?KGI z#y$#NlF7v%S|;2w5^u0A*L(cPeq^5feTmq@OVpF;Q}IMB)~p%R*N)`NFCJ!AxNn(t zg8xAl1722Ax!Qlw__2lvJ|jq0nRn!VTHL_&Z8ibxxBaNW&3B{G9~xVZPqD;ZI(m5A zFE#O%JyY}NUP+%8B`=LElR8Y)5hIBj5)bv@O~Z)PU-IkuV_Yeqgr%D>^k2t844oS!>-#CRilW`m8}4M*Rp<)Elqh+wAyQ`bBQ>#07UY6Yt(Z+-GY$dfvD945Du(dIHtB z>43}Q#J`OJs1S;ebNn&1}}Ovs5C?IT;uz zmUw2yu8{KvJSR?4Z2m~?@RFBkyo~ZEroJFMNiyCps?U*NjTLl7@dgAzH|V188^x5E zn!biB?F!UwX_R|^5m~(%M9UrG8sZLu)Z)n%Ejzn?#blmN(zb?OUqUqkpZC8JxFAv+ zl^QoJc8{da?~l8$(dP>)>Wka%s?L)8#H_g!HZGRSWif zcF*Iu?fbmCJYp&V>`huff@4}n%}Yvmx)V1gnbl%Eox`OP-Fzz)Ga_dw(r3C)urq}} zk~|}DG_25{zDjMWcJZ!o`go{E+@@qyDc-s`J7Kvi=Eu}v9(Ora#=8W;O;VDHHUZ3W?(~Gc{GlO&1=O89K$fTNuS?!|_6Po2+H1B=h*u3F|u{rh{ za)dVZ?BPdgVpppu6Oln~E%Hjx6xw4ttRl|24B# z)*uq`#tflqKgnX}_$?9VD*>sbNyfy(Z(+;*GmkCL94QCDBm<0>^{LznpLH2+GmT!n z%Q=&a!xwHVz+Ufpz@K5I1P zr-iEHFcG!Tzc-rWt#92UZ1=2QYQD5qKOaZ8#ivIIV{Fr5#LEk<<8?!NM`2u(cVkDw zoQ^65zAKlGw$r=dGRB*D;ZT0G03DS8NI!6r;mXOTq@q-pku%gAX?69YGVcrupU&7{ zLSHlFg}tL0kI0;2K3gd#BrNd3+6<{o+JEc5(M#$#monQn!yuLW$Sn&(A@j4A!H6AQ++?2T#vqz-y}sdZWiF66mZF;-QLYd zYN~j%=NV4xig-Jtd$oE{2AIMzAJ|$N7HE2Ol4aZ!p0WfgjYWG0ob)yt?_bNlupc?u zC4u~<-MA?JX5^@MfAJ-3Kc$Isv_3!ef+B#*_99d|d<3BE(8Gq)DOH)_-ZDDn`%54%>{Gb%b{it_n|=CT zl6XtO-pT%+*Tk1S~MjvK$AInRL9 zLm`TU>q$8QQK9uFu0uCtXv%tD`NVo6E;R&wvL%sDm^Iwyv9q0c99KO=N%wM@!zwZz zZSuhHX68b)C_`jj1W?}~lPWb+i_ibg60zJ(U)Dz-kFi9%V=OBa9%dT>GWx6*NyD4E zAI?MXnQ@M_Y&<0=c^}4lv-(DI43q^msX*QbL|teDUka?(-5EKAQT_;?6JO<*;rXNy znjvJAtU+F~=#!dcZg;W2(hA>FJRt`iM@itx_qxy<>NDN4yRl~@+v`G!Nf?Q~ZTmd9 zi*aw;=P%3O+XVWLt0?c@_g7T*$>)2OgG*(<^d*{PU*-Fyp?!+@Ugh9o)84Y$x8krn zO6>Y_K;>5#*}fM%2ztMC%Z2`X!EY;8->V!XrC++G%<;X-0k^&(;`f3FGs3-Z=UY)b zu`l>NZhbFsFxA>i9KV%f?tOv3Nh7~kI5;-%rDWeq$`4-@eooN7uX=Fc*c-gRl|10j z;rKd~e_#1v-?=xod@E{v`<4IBNb|kU!H#LKr~Ouf_p&8 literal 0 HcmV?d00001 diff --git a/test/test_files/xlsx/multiple-tables.xlsx b/test/test_files/xlsx/multiple-tables.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..b3cd5ad3431cee80d105f6f0e594b83a9e3b1676 GIT binary patch literal 4854 zcmai22UJtrwxxGSAk@&MNPla~&NGLLz!RGBPqe1T5AV?@Tabul-zv z?L4i09o_7`{&OL6(a+T-cgWnW4Ms)!1@xK5aN5y?8Kg{aF{A~fndNh1O-kscDVPd+ zb6wIAfzRREC;xr(wVQYCHx(ODAkS(Q9dQ6d&6YV2&8B;Gnt(KN)GDk?)hw~Y?MDrZ zMA=|t@ws(`MR67FU-vL3*Kc0AJS$4I{I-UAk*g^A@)Pm_b)TC9&sMK)E0AR)+Fw58 zpI=22W)m*#9BYn-*Gf?AE9SBtNOOlX+8hw?-<~afLRzkSMduK|GZ-m`B8ZYqhrV2c zjH@Xf$&yPSgLgl4>oxa|SayG)@4{!-#$2dZ4(N`m|9;xC>KmJoh=Vz9l&Anf+)IMHPi)&RlZZsdLSi_MZ7_fpIoHob#a#lU>i7UZh%+(I6og52wTl#1*;dLM!G zK4q-Es|u}h2&KpY->q`&jIGT<5%Y_r#&z~`bU`dp@GTDWhD5E5ZGS} z^8j>Hqp1u=V){-Ho8;z5=}L1go8`*^m0W|I<97Y<&C3kam~vg#=|jG>r!N@&wDaD` z`@5rTHxaU93Gu`RFm2*Zc9Cz35HO>BO#$29EVOaFt&@?$bZkP&|2$rKcy0Ekty zMo3>0%v&UeRrRj)(9jKhxYI<)2r`MQuI9DQBhzepswHmQD|II)(2k{2w=FYyhLUzf zjfBVdfs0lE6IGFHq@tez6Pua~%0{rQ7=JgT$3%#`<`vOu$k~Ph0Y=%%Q)&8Akpr1AHc7VIWg-<_E z8TR_tq}vo1@CbQKINyXGZGtrDR^2z`G%;MQT@|a`DnySFNID)jGb3CBHS%xsz%2)v^Z!(FcvfBewI%!tg7_e$qLv|pTdR_H41 zltc+qWk7`~(znBVP~jj{Ve-Y~8Xyps1Br_Y2SR2)@@}Um4=blehHNW8N&`HfYcfBkm2IBX zqv3#=D{Gu^O_=4AC@cZ%Mq9wQ2-^^!_~eS`S?2=+!69J^S4GSwy%>+DpH-5SfNAR8 zE3S<0%J_xNushY)kDHhW)pZ~U8`sQ*1M9-}O&aCt)?Dnz*&Ao(?J<7#QVr{+VWW@2 zi#`X1)Vj{hxH)?h#&{kq!Tn;DItQK{X5~y=V>wRQX(Cvjf4bfi5(rfbCyZ$L2r1&b zVnHkG258S@-;J(LZF&n85)maFv9|wE(z3?$q#$1JI$%2{xk5>U?vf^^Npv;F$0^y2 z*U3nD6y3qLCNf)pnA^LUIIARNyT)LntKUUHU#}BIuI0J*n628~-w(RBG6f;MVA`-G z<&&mlY;3=?mniu)fRR!^q(a?NxXq|ku#KHo#MknCTkSxh&82)BYx!aV7DHW9;@K+a zZx)4Q(RliF>35$VYL8q&6c#6sQ%|yxyRYPy4YSjB`mCW~wbn@GVX24p@B5bgp5*GX z>~tXBfvGo1dS8isDH+IF6U@eAa%O(Ac7wWFX^WguImEG)<9ui-=ty2l106ACon!zz zdV-SI;F}96%9>ca%_89ZVtiog;N^V!`>sB=Ri-QzTJpv4!rZ6s3_Z)EQpi5_AcK5> z`ZuzT<)o8VsP6I9MG$k5lH@p*zMIR2WNg#gQ#llmu5c+OyBK`*bU6^Lny-@3w0LOFANt2 zbg^&}b$V9EG+s&>3AB}3-ZZgm6x=P{*PiUD`c8@?8oVIXS^#^NmC^i(X!Jjc=H+1R z3AfYt_Qd%EoGox2RK%$Z@zANd+=mLBQ-1}N4K3#@DxelmnaDy-Prph;dVimJXdI~D zr1$K>m{-4@wPEU`M5S_`4?;QGT}w3dvBeX0Swn5CEYLbwe}3mJ8_GDfy3uUqupX}@ zna7p&OLG1!w!x!QyUWgQK)x3N=IN%nfxJ500A5DPMe zjZ0)m{zUr4{QQisK&Iuie1TZ99tO3`Y1_NzC_PtE#gmu)TR&d2@D=V7c`Sz{fZ8xWg0_~nRKQbpflCZV# z9`37b`URiO_{!#xC$bS($2*HHe zz%MF<@wk~X)jX|QVY2RvyAe^Rb%#Aqy^9JAM_*0kd;j&}OQj1KeBq&f_O}?@;^6}$ zegb&>5Axx03PRBeDcgh^#u@AUtkvm~_fC@$?(qqvLT?GrkIsUTm2c(IFs_Et+q%Ts-XCk)T zy`uJV33_vc)#*OONG)GulaTl^0QvDe^m+uOla(nox}r7`*s8NVxfR3{1!3P8t=UX& zIOg=!eiv91LX#7qBzxWC&E3JAA$un}GPQk_faLx4*6nMNC$SqZ+9%D28ontGr@oUZ z46B0pF2WaGBAep$eNMg$onQ|eQZo1NXxc4opJ0gp#0kL9Ucnu}Htz1ur!F~J1M8B3 zk=jEtD$`-MDuU5!V+Lv1_XJ1l7|3Ssn%#ZzspgbKm3)`{^1~c=x$zUX8E~%nY?!Cy zd6%qYqEZjNaeb}kmT5-T)irl107 z+zRVR8TQBUTs(@2oFL4|BZyXgYna`vgGu9)Qu_D_W@ZepyNRNRq4JAH_u0AmW{!TfY5HRQ2h00e(O#AH&Rk!>&9D+ zZh*}-Baf;(m$y9o-tRPWa(HRf`R)lJjyYorQ!C|5B!UBDznhrd;1vUi9d!efY*3R? z#U16JqO3m;vxsEIQLL7P@}{$z1^L)hRZqr|#=PN=8Kl=bR4gagTaHfBfl_X>uMuoe zmS3HTjMt1k_1T)e`DNFzd%*ukpAG)G2S1(9*_}A`K4r;@P8b;QDt0@f$49GD^sdL` zXbRX(7Lfvn$3!dnxHTsZ_Hrd>F9VgBEMiVN4o`+0s*k_{TcHqNv|tH}jwk@+^2wBV zGb-c9d0^tIyO#BlMaR+()nu5sS4(h*ndkVtjJT$~2UEYJg7z&a@74v+M_NM3NK1do zXGn~Kg}sqbPom@J@6_j`tD6m5ENZ4#!uR6ng#ENc>fF(?7sz*5hDBsl4hXYKISu5B zI=W*yZv~$!TsjL{e-&GgdaQN)cgsYH{l(96{cR5wX^?*l`0YJC59@n*&qerMs5u_h zDsw9oJ3eb$U)PJH%t-mZ6I4{1;co-@YRx71sN2)Zfnfy=#vk2lyn;umYa(>MO2DcE z@9qnTuyjwRo09y?Ryv!o5II%fMcEP#dlpeohqpyM@_m`A?k`iEQc1pF9NeF3Vb%F7 zv-Ax?U;(Ao`sx?X`>)(|{LbBd_@XCe+tR?_1MX#xBk2qMJ}qol91O^@P7xJfUW6R}3B z)WTMo0owknf2t{s$bC5*YUlg0Q9&(*Cg8a=`nk1OI9mZ!;i8>RIeKEVB&P^GJfZ*33}i;3{T6Ndx8XXC2#aW1O4TeuZkN4|2!r8GH z_Z;D$@o!HRen&Yw`TP?_1N*;Nlrz)LuYvk^l(Vw~Zc6$UydCx z^((5drN9FIK7jlVaMpa_tkkbyB|8Q9KgQ~JgtJEy&ZPVbLh_#o|7%%(UwT$aac9=A z5XOqZ-{ttv3HJNqv!a3%+h0*lb-MUpr1y86vy8<_(63m+rgKJ+eqVUD>Hb;!A*^Hl ZFZ9;YAjZzkcz6`pS2k7(Re`v>{{d#~$0q;) literal 0 HcmV?d00001 diff --git a/test/test_files/xlsx/table_empty_rows_and_columns.xlsx b/test/test_files/xlsx/table_empty_rows_and_columns.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..f599b8c336860a6a35ab60167b2990bd74a54e35 GIT binary patch literal 4743 zcmai12Rz$t*EVXeklM9sQ$_7jn~ROB;lJ=`sdBK0b!GK$J1Yg`h!SdpYu3yIFeL zIm6ulb0K)$%gHfw2<+T0K#cbp_=(hD#?FKWsCe0INK=4xj@_9)?r|S!&XoTaMR2!@X>6zS_i9>^lEJ3@ zs~d2${7Q1aM;#{WcWz3~-5_3lUrn;aoQISwBp6V8aA)Afy7G=3ej5CB-BXT*^)~D@ z?8V&^jj@m#5yC_H%q!o-Swg5F-*66X=86mPN_B2(A7OSq3=)dO3Kvb{sY7v(t128z z6NsPC?N9gUw)Bmd_e@{z#$?dyxK^*|(-U67Vb&54Rq@;mthL2v2<-2v6aTU}UBS~U z`jU&+vBGSo9zGt#E*J+B0|O5m14H|NrP9;I&B5Ik3iWX3KmYiV%7jVh zMP@+QQ;XTQ1(~jauIbTAaA8T&2H0R@J`Hju$F&@TY3D$SgIj%p-eJq)%g6MmmP*^= zYUCtuj7kj6h0bjG#-wa8lpTeyA0UQo<`cM+;=IoMxk2yEG5& z{ZhnS_6>$}4@P9g|9akh-{izPpMpEmel*ss4#sz~Gw~8I+2uK_c%=APKBH5%l6DKh z|2nQnz~_<0rl+gCB>$5Ku0(^Omx$hFsS`>Eu>0JR=BKpNwwS}QrFxdCHU8xFy$9HD zs9HBj0OsPQB((D`5w{1jz$+!1e%nI4@z*S`E_(6_@} zgiNT4K2%(SH){z;pt5hRmy}{)`d%|O70@KQs*24r3tywTNK@FVPwZZXuQhFjPJ0S+ zmWX^r6_?fXiKC_uHF2JFn7o%h^%YgeNCh63WW;^wZ`_pctwjt77`^4bFeix0syF%Ec+Hjj`rP8y)gr0Q$kZ#9u?06$v@lg zu2Ga(E8M_JkOJ~Hg&`6;30cUwgs7;!-# z0y@%qm$I8<`%+%Pvkdn2_2VXB|JqJCb`uKB?^_#uXwoE0fpUb6Gc?UEz&gBOVvQTc z!DG)t@;>g;1!CsFegS=oookzEK>kcNN994-&jXH zut%D**&FeXwRK)W3C`6YW%ez_&MEL&p(r6bdfix;>$SrPG~H0ouT;5sd-0&wrnvF0 z-ECYJdyuGLYz*5yh!x%Qp(4@?C{r`%Z#OFDZD$}C^fc#euNlaNNMu7SW%IFU4Rr8u z<|-Y&nC0R}VCYdK-7h-Q8o3G2%}0)tOwtm#tYwxAGmv*ZKt&4FSOzH$i#@F$?O*mP z%+#UX?S%hDN3w<6*C6z{U?2mO+cj>U{}n@8CVQFmK6iFg-%(< z=?feeM#_TN=L7OmCr~!DTn;bC2d2K&EhLS0_g`73PE{f&SPIF_EOMdjT^$n(>R0pE z&n8p*g1@;MciPIMb24=uNRy`^I!>(T?6@f!)r=~ViDc!MJ`TuI4noC1%vLB(AJa>y zTrshe&g+PF=sf&4``yvR;H)c4!A^U=i7cWG>Npz4lUK-vQn_f-M! z(wzYPw7ZP)$rRmbCgwU2NFRXaLxGpbkD z+UObI?ZK%slADOQdu<sm2#GKz1L=y@EP#zF+FCO{46gEf2#8bl0^vg zMzSz}kKmLn;}qqvB%{HL>d5!Sk)u#DQQWI@7FNn-)X%PW2zf8i4HmWrL+*3S1#|k( z-l?{W{Hj*Itg_MyFRRlWDz?ImuABq*+;H%Rm8P_pxAF>ux{e4^3ae$35pv)K|LnaQ z`QU4)>804w1qPcmiA}P1CL7^j#&_4)peGi~RKDThs?nKYO7*NL;33;ff7Q&L99bd; zIMS-cgR(OT^c+RuS#xcpP1r`H&fTZsmfC2~?f5L*%Ea|2N&TH#>n*2^4OXWrnD=`P zIn|~Sz&*aiwD^v>M87HdoCU|l*UId}BQ*e48g+l4%?EB7({}t;ryTW@4_OWs-kwqk z1Q-rRa1zMepC;n0wukgLKk*ndb~#pgtUJtXu;-BGr=bSdmgGtIgE<9fOnZGw`E>JS zZT(zyRS%DDuAwK*b(}xd2-%MsIh(i;7Z-|Ye-6ICfG`eH$wG;ej@rg z!Pxo%2${cSoi04p5+AFS?U-G9l;I*Xe(F3+mnl3K>?X?Tn2Nkq?5aDir`hsqhKe2; z(o?cDVcgS8i8W5G)yVX5@Ycrxg3-y?c%ClF9N`E;Vlt3naA*9mcL(eBV?@{lc5)V0 zgvxt^v=;4-M0PQSm5&0Z#yps1nx@1byn?-kq_h1B^chsr?3i3t2)2TF7j?yjVu$3k zZE^M#(xq5fS7>NA2a|mQd+7LL)L(dQG;`dJkB_PyZ#BA22i6EXuIgIdcIzMAZDL|{ z2i1BW@WD@*5V;8zvIQa!eWSjbnBHa+A`?390mMNfC&TkQOFxENej26~Oo=93F9>8y zqBr$_03)uNjK)K}<3J2v);y9gCD2`sh|}gF+F?LpZ4#9#&xXZlM4g|3MqhTxE%X_1 z{89MQ{WyakQs?4HoQs_jq`ZBH06-ai91-JVmGQdjhzt!b>&ZMk!0?9~73?e;6WK0wEhcX77DlfQEUQmbnl$QUUfrC8pb zbuz0Mdo476Uyz7&+c=Ola!}W1(RHXEpZw;mFbheVQQjRDr40+hLuem+bX0*o8S(YT+I{*;!WmK3 zV(`kWdS)%;Aew^TOH;7crA_)8!7lBvpp?=#?9^f=eVM$@o+u{6hvzymiMCB|9i5MQ ztUpyP5&9cH()G6e;!!j^KajSh3{py?(Ck&X!;XC;?KC&vAk-2CFbDb5PAb*&VT-~bsF z&n4*sMi}i4H{17ltg`(nDlT>L_6fLOuMZwhy`tClODTSb<(orfv9bP{DZRm2+w1E6 zr|-I$D9sUUAQu5bBaRa`*_nv0Ygw%?kPoGen$e_$6L0#oOB1-ZMk<;sFY{f!ZiJ9k$#FDJA)l($*2AE-iP8q=;H zkQdHbYbDyc#T@D)?T0WES+ZfzNqgm2peB9z_BfyfvBm8Y{WjR1cfws#!E_l2i9>o+ zxc{r%(o3&xql4bGu*D-5?7A&ns=)oSOHNHPfLv zn^R=h%%Z?g^M|FgH_DCcOJtVqv< zx#W8PCTy_fJnK#o)!HKHjMJVQ5lpPh7(b2uMW1?R?0=1av-#SfUr{bvonKKv=-)-7 zT(CDk9q#Wa7tO+%E&3UnXfpW~<-eMw-vKX*qBD>2GtN928t`|=@jJjp(Q&4oenunS zIl%u@Qokcy97$*5&o-VbAO(DeC|h08E}poi-3RtOI$!gAM2FS9X})N z`7E8y++c8Pmw!G8@FE=SQwEISt^6eTpMjsqw5MPObU-<3%_Z)r#%S zPL^?Xi&SxN+m{(nJ?pDrtuawk#wWlxhxy}TV`1R~u&~truT%>DKb8M2m2QsCb}lwh zsH+R#$;Z#AjOm7F^HT(+nNDHvPthqzM6<9cu_M`QI1=YS(Y@1K!yUC+u1RZEDK_S{ zfj`(BUQ~sKqVJ(t8RE+fi>Pw$`LX8am(-2LF71{Mzm`H4g9_PYxpguD6-#9YI$>j< z9D3tJvC{2B3>NDzSSa6R4++2DN$mxzjC$2F^y&E;38=~ug=4Mg9L)BSh>5UHzimLJ zjni|m^6Bk5&@28M7na83+;&3Q^SG1e-rE#0(rWV1QqigPvG+6%9416&+l7GzzP%X7 zDhGtB^44BMl={*uy=RQ&3sV`0_jDGMVb0zoQe9q?*eEiSo(UHE#yNS!%&Fp(Xq*o3 z-o7$qN6Dbz%)~y?^+mq06*R4)y);EwGSU!Y&u6w5Y|Z{vj}=j?{Lz2IK$O-=duCvf z)<=wSty>~7Z8LA^oBNhd+SL>*lf%Hagvy#by=k8i^2p4@tJ$>c!8AVZYXh;Q0Vp5tyxmaoh7F#_ywG;?bFrC4*p-CTo zi*DQ8!HGA*t{`TS7APU5|2R#Nkla?=cQ@rAByjr1HaRAfQR}| zv?`M9juGkT*V(`U?2S4-b^($Yu+!04Qya>fM&2l-6uAw=aPTaDRi)zCL#|^U{$n*c z#lcAm{H!L?z3^EU>Y%jr4b!qGA@8zjdmmhN(5`AIw6z{OH%k{7io1uZ*$}v={?@*D zv}JQ&Mw-cKhhKs6exF{6b_5C$a-0IZbz}2Ot6R`}!K|x~xk3hz^GnTFJUB|qjgy+5 z(YsJ$#k^KuGmJL3hrI@umBHiv_+mv+Vk3!6)!}B@UoUZfwykSyK{*9R1ibGvTPj4y zNZE}k^${ootHRwEjXBBnN0gDco_JXfZE^TIRJ}dFrCKK%b|VkChBK=?92SIT`#?&CXdo zbl)HV50xBDC@Lrw#k$qyrfFG!-;a9-&N{9{cl0reaB`90;=0z}Ov6|>uA!ya+;=pe zb(56-ty!Td+a_jF$9esw%H4Jov1G-&tCe2RP7oD!Y%%LICsl**vpGc>)kwz@xkI_# z40qr(9MX7M8h8uYwfSA4?4~fPfIQxo$V_J8ja{SG}$zWm@aGsF%&CkcQ+^raGwTXUowcAI78 zv+k7#fv-@fmx#X9;k%fjU6>PTo(7|7+cG_yh~Zq2Cbz-PAJN#Mz);o}|re zpuJMPB2>1S(=vi10y}y^=jq_3oM{Dun&8X|1q%r^NA?!Iq{k%>2o<)rvoX=5prDyE z>p@@rzU}hE9s5yad|>=SSn)?79OR`KQMUr^fcVg6%gWlFb_q9%69Y@ zd{0OoyG4H#z14)Xgf3j`_VwaY2mk~&EO6$sNf}d0!pUF7Gi^Vvh-rGq#UmgH_+Vx| zng4PXRFD;+p+x>A1XU`lOeL<;)+D$R^1v2l2)5PZ8*XX8uqrT9_dTU&E^X24t1ScFyS5N)$9uuj_wGq)#V9J{3~}G=0zXG%IfP|Z|y{ieD$Cq*7Pk^G~sL2 zE9P!xq7-m5x!hXapJOhbX>KNyheNNej*mN2ZntNgL+}_&lPd0h;dj*!Ql2?^s8P~! zdP2wLlxJ_5C_5gkBKfP${N&z>rqvDg&Lav^)akd{J>N5sZs7IQ3w_S-PhRCt#G($yf8wAP z9XHh3dtrq(L7tLuE+8kR(D8it(y*vsui`_kOmf9Ng7u}SAI)6qM-#k2x?EY2Q4&qK z{klkK(`unK62vEY=$moVZ#BZ)c;UR^6Gm~!1p}B2mvCHb1Hvbwl$f=uso9od@&wQ@Kxgw-dJoYs_`Vq@kEwbXQV#!LYi#p>|p2Bx{<4xBXC zBrXnC#d-?KMjiS^Mmp)4~?U?jP*%HtsPqJ#~ zJQ;Op-dIh-Kr16XR}Ft(W{0jhakxUwaH5=lw@cKu?6SIfX?J=HpW%t^B|A6;Ta|}V z+}#u}u-YYZFpWq$9`Xq``O8lvx2#_UP8c^1qQj0_1BQL5EfcPhxag_oce1SmFn$6r zE)VA(dx8JQeoFe6pRVrq&=Uu%v|9dEN>bY!kcI5!O!Mgcr#q3)-g9?6hhTT$h*W*s zW(u2}`~b0~E|4DSOZwPEZ{@VlZy;x!L(X7$vnI)pYWbY&7!%il#q@+%23USw0^Zz> zydyXee4t5>ww!a}gMzzeWN&_>3-;{rTgsU?1Ngfxsn75{u{i(``#)_YhDBes*QHKN zm>(PLbCQ8O*43iR3=jnAXoxQ~J%;8>;&Q~K0Iv?ar^*&YBJSJMSttKWz?1q9kWN@X zawrFf7C`CbT9(;>o8S96snOYgQETQ#Sv|FBGtkF|drY|fJ9y+Cx64e{@C4YV-vayD zgBf6#pxs03lK57&8G!KCWtJ3?1wprSD^#IjZyWGP%Zqp4`5KThCczeJedHOvmu90E z-;KuY`}gqdk==rQn0k9&DI*j}9NQt8x4B=yD|_pvbKh`DrdV)@tC2ah_broyg|0uj zf5JrpHfv~Zw;EtnseaK}|NCvXU4cpUCZR|5b4AhFQSElyQi`vW!}&pQntXV z-WQ={m#jro6Qi5*WK<4M;=Wv>yz#?vER_7Q%aZ?$2aLWlcXYHn8PZUba6f*!pimDUJ5L!0B%e_>pactwf>2c(+PvkJ}wuWhZoI!V4Nv<64G-=rMTFD?+VT(IVMQKEm zos!PX1s1`%zyC1pnc^HkR9tuHx)Se9H?QaDk%60po+pyu(Mzd^qA);%PibJ{jRP*8 zF43wmx!C<%E8O^!yKL77ki6)h~#RR_w2)|J!%JU>}bI?!O65GK&B;Nm8sD?lbrp^yE8a%atB4$Gs=v~&XleOclG1bu>9lf%<%Ixe#-pQXL2HSnougXZTu90ssq;^ zj}DioRL4+>CsB1(tB|u4mqE>yWfPC~c2Y!y@3X}7-quA+32b=VR8D!3G$GpJB{rz3 z^~rFI)>IY^aQVKiZA(3tAETf23?_oyeqK;Aq&i}UPOxl-AomT&#CEfZ-8?P zn`v5Dnk-STLkG`@*)_DhZ|fR;9=%!$6q%w<;g*Y4#tb-O8y8j4oy{sW!FP4dkfnOV zioY_#%DESJtD;McWDXkStQaJ3@x~G1C$>^|@F8j)49>zQt1<+B=d!%;>$5FkUugt+ml{Qua>)XJr7pYy0)efPW&`_f9qjx@l$tm@~>tr-05A;rZ(r7TNtp1 zcN(Rzh}8{vE|+tdwqxC|;^Y(Sj)%+2)HYL@`};~m+~y?nFIv+JI@`R<1@ya4LU0eCgoKz22L$KaY8|+{{ZxRAlcH}40V?P4ROg-+n zlF?j*@{!bQIv#=W39=nCSjvwb;rL?-$7sU4a*l9UDBRUV%iRg;VuVTCcCB$+9080b zY(ov3XG+It7k!BCXsLO1uUYnsCy)pwIE3>K(J6@E zEvIBI$8ng?E)PX{_nYAD7N-;F_&uOeu9h>eSblB!VoTvBQMFb%JLCt;Ic-t~mUYRo zfcLy~k70~7t#VZXxX4W>#APmP(3sd`9{i*`sn3IABib$8rZk?CG!7T~#e0RwcOVxX zaIPh-8Gi@C9pU#Hh!^8v1JlngUj)&9pgifPHJ?*IpZU3?y-Uzr~ z`n8mNaJ&AGkLMkU7PTS@cudo@@ai}Ha#F3~*GFI49`@GF-!Pf$7dgbk z@UO1<>7C3m_w=|Jsep|`jrHrc@bu*u=C<&6`*(MT|9ceW77rxf6TbGLH9h3gvGv9?k-up6hCo%zXSf7;?JU-9*ln9 zRy6)ulvCUKH4&Z#JS_k*x%1272~GgdPFH6EPAe;nPW)xqBqsp>Lp7d7INgge3hb9H z|Ap``J$82K>BfO+*1v53SkV4O>OWie*~O=8C#JXjvS5mn#s8+$oW(g^m@t+0m$gv- jR(H=XJe^U$b6-sLw^d9{85a+OPI~-uJFfUwP6+=83K5}# From 6e4b98541f32680467717da30dffb807eedad0c4 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 11 Dec 2024 15:17:23 +0100 Subject: [PATCH 11/15] Added more tests --- haystack/components/converters/xlsx.py | 3 +- .../converters/test_xlsx_to_document.py | 31 +++++++++++++++++- ...tiple-tables.xlsx => multiple_tables.xlsx} | Bin 3 files changed, 31 insertions(+), 3 deletions(-) rename test/test_files/xlsx/{multiple-tables.xlsx => multiple_tables.xlsx} (100%) diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py index d36c86aba1..78572be8f3 100644 --- a/haystack/components/converters/xlsx.py +++ b/haystack/components/converters/xlsx.py @@ -164,8 +164,7 @@ def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict] resolved_kwargs = { "index": True, "headers": dict_or_df[key].columns, - "tablefmt": "pipe", # tablefmt 'plain', 'simple', 'grid', 'pipe', 'orgtbl', 'rst', 'mediawiki', - # 'latex', 'latex_raw', 'latex_booktabs', 'latex_longtable' and tsv + "tablefmt": "pipe", **self.table_format_kwargs, } # to_markdown uses tabulate diff --git a/test/components/converters/test_xlsx_to_document.py b/test/components/converters/test_xlsx_to_document.py index 784ae4c43f..72964381bf 100644 --- a/test/components/converters/test_xlsx_to_document.py +++ b/test/components/converters/test_xlsx_to_document.py @@ -14,7 +14,7 @@ def test_init(self) -> None: assert converter.table_format == "csv" assert converter.table_format_kwargs == {} - def test_run(self, test_files_path) -> None: + def test_run_basic_tables(self, test_files_path) -> None: converter = XLSXToDocument() paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"] results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) @@ -33,6 +33,35 @@ def test_run(self, test_files_path) -> None: "xlsx": {"sheet_name": "Table Missing Value"}, } + def test_run_table_empty_rows_and_columns(self, test_files_path) -> None: + converter = XLSXToDocument() + paths = [test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"] + results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) + documents = results["documents"] + assert len(documents) == 1 + assert documents[0].content == ",A,B,C\n1,,,\n2,,,\n3,,,\n4,,col_a,col_b\n5,,1.5,test\n" + assert documents[0].meta == { + "date_added": "2022-01-01T00:00:00", + "file_path": str(test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"), + "xlsx": {"sheet_name": "Sheet1"}, + } + + def test_run_multiple_tables_in_one_sheet(self, test_files_path) -> None: + converter = XLSXToDocument() + paths = [test_files_path / "xlsx" / "multiple_tables.xlsx"] + results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"}) + documents = results["documents"] + assert len(documents) == 1 + assert ( + documents[0].content + == ",A,B,C,D,E,F\n1,,,,,,\n2,,,,,,\n3,,col_a,col_b,,,\n4,,1.5,test,,col_c,col_d\n5,,,,,3,True\n" + ) + assert documents[0].meta == { + "date_added": "2022-01-01T00:00:00", + "file_path": str(test_files_path / "xlsx" / "multiple_tables.xlsx"), + "xlsx": {"sheet_name": "Sheet1"}, + } + def test_run_markdown(self, test_files_path) -> None: converter = XLSXToDocument(table_format="markdown") paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"] diff --git a/test/test_files/xlsx/multiple-tables.xlsx b/test/test_files/xlsx/multiple_tables.xlsx similarity index 100% rename from test/test_files/xlsx/multiple-tables.xlsx rename to test/test_files/xlsx/multiple_tables.xlsx From de3e844b048b56745f59900acbf366e216d8a5b6 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Wed, 11 Dec 2024 15:20:48 +0100 Subject: [PATCH 12/15] Fix windows test by setting lineterminator --- haystack/components/converters/xlsx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py index 78572be8f3..7a0fcabba8 100644 --- a/haystack/components/converters/xlsx.py +++ b/haystack/components/converters/xlsx.py @@ -158,7 +158,7 @@ def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict] metadata = [] for key in dict_or_df: if self.table_format == "csv": - resolved_kwargs = {"index": True, "header": True, **self.table_format_kwargs} + resolved_kwargs = {"index": True, "header": True, "lineterminator": "\n", **self.table_format_kwargs} tables.append(dict_or_df[key].to_csv(**resolved_kwargs)) else: resolved_kwargs = { From 9be83ae37a0eea1b7f21fa975367e30d5cc734f6 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Thu, 12 Dec 2024 15:00:44 +0100 Subject: [PATCH 13/15] Addressing PR comments --- haystack/components/converters/xlsx.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py index 7a0fcabba8..4bfd667db8 100644 --- a/haystack/components/converters/xlsx.py +++ b/haystack/components/converters/xlsx.py @@ -25,9 +25,10 @@ @component class XLSXToDocument: """ - Converts XLSX files to Documents. + Converts XLSX (Excel) files into Documents. - By default, it reads all work sheets into CSV format. + Supports reading data from specific sheets or all sheets in the Excel file. If all sheets are read, a Document is + created for each sheet. The content of the Document is the table which can be saved in CSV or Markdown format. ### Usage example @@ -38,7 +39,7 @@ class XLSXToDocument: results = converter.run(sources=["sample.xlsx"], meta={"date_added": datetime.now().isoformat()}) documents = results["documents"] print(documents[0].content) - # 'col1,col2\now1,row1\nrow2row2\n' + # ",A,B\n1,col_a,col_b\n2,1.5,test\n" ``` """ @@ -57,6 +58,10 @@ def __init__( :param read_excel_kwargs: Additional arguments to pass to `pandas.read_excel`. See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html#pandas-read-excel :param table_format_kwargs: Additional keyword arguments to pass to the table format function. + - If `table_format` is "csv", these arguments are passed to `pandas.DataFrame.to_csv`. + See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#pandas-dataframe-to-csv + - If `table_format` is "markdown", these arguments are passed to `pandas.DataFrame.to_markdown`. + See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_markdown.html#pandas-dataframe-to-markdown """ xlsx_import.check() self.table_format = table_format From 59b735f75768eff10b06ae27eca481c9c4391880 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Thu, 12 Dec 2024 15:03:00 +0100 Subject: [PATCH 14/15] PR comments --- haystack/components/converters/xlsx.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py index 4bfd667db8..58966e8ab3 100644 --- a/haystack/components/converters/xlsx.py +++ b/haystack/components/converters/xlsx.py @@ -146,34 +146,35 @@ def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict] "header": None, # Don't assign any pandas column labels "engine": "openpyxl", # Use openpyxl as the engine to read the Excel file } - dict_or_df = pd.read_excel(io=io.BytesIO(bytestream.data), **resolved_read_excel_kwargs) - if isinstance(dict_or_df, pd.DataFrame): - dict_or_df = {self.sheet_name: dict_or_df} + sheet_to_dataframe = pd.read_excel(io=io.BytesIO(bytestream.data), **resolved_read_excel_kwargs) + if isinstance(sheet_to_dataframe, pd.DataFrame): + sheet_to_dataframe = {self.sheet_name: sheet_to_dataframe} - for key in dict_or_df: - df = dict_or_df[key] + updated_sheet_to_dataframe = {} + for key in sheet_to_dataframe: + df = sheet_to_dataframe[key] # Row starts at 1 in Excel df.index = df.index + 1 # Excel column names are Alphabet Characters header = self._generate_excel_column_names(df.shape[1]) df.columns = header - dict_or_df[key] = df + updated_sheet_to_dataframe[key] = df tables = [] metadata = [] - for key in dict_or_df: + for key in updated_sheet_to_dataframe: if self.table_format == "csv": resolved_kwargs = {"index": True, "header": True, "lineterminator": "\n", **self.table_format_kwargs} - tables.append(dict_or_df[key].to_csv(**resolved_kwargs)) + tables.append(updated_sheet_to_dataframe[key].to_csv(**resolved_kwargs)) else: resolved_kwargs = { "index": True, - "headers": dict_or_df[key].columns, + "headers": updated_sheet_to_dataframe[key].columns, "tablefmt": "pipe", **self.table_format_kwargs, } # to_markdown uses tabulate - tables.append(dict_or_df[key].to_markdown(**resolved_kwargs)) + tables.append(updated_sheet_to_dataframe[key].to_markdown(**resolved_kwargs)) # add sheet_name to metadata metadata.append({"xlsx": {"sheet_name": key}}) return tables, metadata From 669550d365ca719a66d9d1b0b6a650b11dc7c196 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Thu, 12 Dec 2024 15:06:54 +0100 Subject: [PATCH 15/15] Fix linting --- haystack/components/converters/xlsx.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/haystack/components/converters/xlsx.py b/haystack/components/converters/xlsx.py index 58966e8ab3..db7dca8fed 100644 --- a/haystack/components/converters/xlsx.py +++ b/haystack/components/converters/xlsx.py @@ -162,19 +162,19 @@ def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict] tables = [] metadata = [] - for key in updated_sheet_to_dataframe: + for key, value in updated_sheet_to_dataframe.items(): if self.table_format == "csv": resolved_kwargs = {"index": True, "header": True, "lineterminator": "\n", **self.table_format_kwargs} - tables.append(updated_sheet_to_dataframe[key].to_csv(**resolved_kwargs)) + tables.append(value.to_csv(**resolved_kwargs)) else: resolved_kwargs = { "index": True, - "headers": updated_sheet_to_dataframe[key].columns, + "headers": value.columns, "tablefmt": "pipe", **self.table_format_kwargs, } # to_markdown uses tabulate - tables.append(updated_sheet_to_dataframe[key].to_markdown(**resolved_kwargs)) + tables.append(value.to_markdown(**resolved_kwargs)) # add sheet_name to metadata metadata.append({"xlsx": {"sheet_name": key}}) return tables, metadata