From 4e1681c80092e8ee756b4b7070cd012bc4efa5aa Mon Sep 17 00:00:00 2001 From: "marc.torsoc" Date: Thu, 20 Jun 2024 13:14:21 +0200 Subject: [PATCH 1/5] add rc to partition excel + format --- unstructured/partition/xlsx.py | 124 ++++++++++++++++++++++++++++----- 1 file changed, 105 insertions(+), 19 deletions(-) diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index f219ab9741..1b1575492c 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -15,6 +15,7 @@ from unstructured.chunking import add_chunking_strategy from unstructured.cleaners.core import clean_bullets from unstructured.documents.elements import ( + DataSourceMetadata, Element, ElementMetadata, ListItem, @@ -25,7 +26,10 @@ process_metadata, ) from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file +from unstructured.partition.common import ( + get_last_modified_date, + get_last_modified_date_from_file, +) from unstructured.partition.lang import apply_lang_metadata from unstructured.partition.text_type import ( is_bulleted_text, @@ -55,6 +59,8 @@ def partition_xlsx( include_header: bool = False, find_subtable: bool = True, date_from_file_object: bool = False, + # TODO marc: fix this too. This is not doing anything atm, just renaming the number + # but keeps starting at 1 starting_page_number: int = 1, **kwargs: Any, ) -> list[Element]: @@ -107,6 +113,10 @@ def partition_xlsx( for page_number, (sheet_name, sheet) in enumerate( opts.sheets.items(), start=starting_page_number ): + # TODO marc: remove + if page_number != 5: + continue + if not opts.find_subtable: html_text = ( sheet.to_html( # pyright: ignore[reportUnknownMemberType] @@ -130,6 +140,9 @@ def partition_xlsx( page_number=page_number, filename=opts.metadata_file_path, last_modified=opts.last_modified, + data_source=DataSourceMetadata( + record_locator=dict(rc=(0, 0), excel_rc="A1"), + ), ) metadata.detection_origin = DETECTION_ORIGIN else: @@ -139,12 +152,23 @@ def partition_xlsx( elements.append(table) else: for component in _ConnectedComponents.from_worksheet_df(sheet): - subtable_parser = _SubtableParser(component.subtable) + subtable_parser = _SubtableParser(component) + c = subtable_parser._top_left_coordinate[1] # -- emit each leading single-cell row as its own `Text`-subtype element -- - for content in subtable_parser.iter_leading_single_cell_rows_texts(): + for row_index, content in zip( + subtable_parser._leading_single_cell_row_indices, + subtable_parser.iter_leading_single_cell_rows_texts(), + ): element = _create_element(str(content)) - element.metadata = _get_metadata(sheet_name, page_number, opts) + r = subtable_parser._top_left_coordinate[0] + row_index + include_header + element.metadata = _get_metadata( + sheet_name, + page_number, + opts, + rc=(r, c), + header_included=include_header, + ) elements.append(element) # -- emit core-table (if it exists) as a `Table` element -- @@ -160,7 +184,18 @@ def partition_xlsx( ).text_content(), ) element = Table(text=text) - element.metadata = _get_metadata(sheet_name, page_number, opts) + r = ( + subtable_parser._top_left_coordinate[0] + + subtable_parser.core_table_start + + include_header + ) + element.metadata = _get_metadata( + sheet_name, + page_number, + opts, + rc=(r, c), + header_included=include_header, + ) element.metadata.text_as_html = ( html_text if opts.infer_table_structure else None ) @@ -169,9 +204,20 @@ def partition_xlsx( # -- no core-table is emitted if it's empty (all rows are single-cell rows) -- # -- emit each trailing single-cell row as its own `Text`-subtype element -- - for content in subtable_parser.iter_trailing_single_cell_rows_texts(): + for row_index, content in zip( + subtable_parser._trailing_single_cell_row_indices, + subtable_parser.iter_trailing_single_cell_rows_texts(), + ): element = _create_element(str(content)) - element.metadata = _get_metadata(sheet_name, page_number, opts) + r = ( + subtable_parser._top_left_coordinate[0] + + len(component.subtable) + + row_index + + include_header + ) + element.metadata = _get_metadata( + sheet_name, page_number, opts, rc=(r, c), header_included=include_header + ) elements.append(element) elements = list( @@ -310,21 +356,21 @@ def __init__(self, worksheet: pd.DataFrame, cell_coordinate_set: set[_CellCoordi self._worksheet = worksheet self._cell_coordinate_set = cell_coordinate_set - @lazyproperty - def max_x(self) -> int: - """The right-most column index of the connected component.""" - return self._extents[2] - def merge(self, other: _ConnectedComponent) -> _ConnectedComponent: """Produce new instance with union of cells in `self` and `other`. - Used to combine regions of workshet that are "overlapping" row-wise but not actually + Used to combine regions of worksheet that are "overlapping" row-wise but not actually 2D-connected. """ return _ConnectedComponent( self._worksheet, self._cell_coordinate_set.union(other._cell_coordinate_set) ) + @lazyproperty + def max_x(self) -> int: + """The right-most column index of the connected component.""" + return self._extents[2] + @lazyproperty def min_x(self) -> int: """The left-most column index of the connected component.""" @@ -450,17 +496,22 @@ class _SubtableParser: element. """ - def __init__(self, subtable: pd.DataFrame): - self._subtable = subtable + def __init__(self, component: _ConnectedComponent): + self._subtable = component.subtable + self._top_left_coordinate = component._extents[0:2] + + @lazyproperty + def core_table_start(self) -> int: + """The index of the first row in the core-table.""" + return len(self._leading_single_cell_row_indices) @lazyproperty def core_table(self) -> pd.DataFrame | None: """The part between the leading and trailing single-cell rows, if any.""" - core_table_start = len(self._leading_single_cell_row_indices) # -- if core-table start is the end of table, there is no core-table # -- (all rows are single-cell) - if core_table_start == len(self._subtable): + if self.core_table_start == len(self._subtable): return None # -- assert: there is at least one core-table row (leading single-cell rows greedily @@ -469,7 +520,7 @@ def core_table(self) -> pd.DataFrame | None: core_table_stop = len(self._subtable) - len(self._trailing_single_cell_row_indices) # -- core-table is what's left in-between -- - return self._subtable[core_table_start:core_table_stop] + return self._subtable[self.core_table_start : core_table_stop] def iter_leading_single_cell_rows_texts(self) -> Iterator[str]: """Generate the cell-text for each leading single-cell row.""" @@ -539,16 +590,51 @@ def _create_element(text: str) -> Element: def _get_metadata( - sheet_name: str, page_number: int, opts: _XlsxPartitionerOptions + sheet_name: str, + page_number: int, + opts: _XlsxPartitionerOptions, + rc: Optional[_CellCoordinate] = None, + header_included: bool = False, ) -> ElementMetadata: """Returns metadata depending on `include_metadata` flag""" + excel_rc = _turn_coordinate_to_excel_format(rc, header_included=header_included) return ( ElementMetadata( page_name=sheet_name, page_number=page_number, filename=opts.metadata_file_path, last_modified=opts.last_modified, + data_source=DataSourceMetadata(record_locator=dict(rc=rc, excel_rc=excel_rc)), ) if opts.include_metadata else ElementMetadata() ) + + +def _turn_coordinate_to_excel_format( + rc: _CellCoordinate | None, + header_included: bool = False, +) -> None | str: + """ + Converts a tuple of row and column indices to Excel-like cell coordinate. + Takes into account that + - after Z it starts with AA, AB, ..., ZZ, AAA, ... + - if the header was included, the row index needs to be increased by 1 + """ + if rc is None: + return None + row, col = rc + + # Adjust column index to be 1-based + col += 1 + + col_letters = "" + while col > 0: + col, remainder = divmod(col - 1, 26) + col_letters = chr(remainder + 65) + col_letters + + # Adjust row index if header is included + if header_included: + row += 1 + + return f"{col_letters}{row}" From 6484e88b7fb5e031d0101201782a57f589105d29 Mon Sep 17 00:00:00 2001 From: "marc.torsoc" Date: Thu, 20 Jun 2024 13:34:51 +0200 Subject: [PATCH 2/5] fix starting_page_number --- unstructured/partition/xlsx.py | 47 +++++++++++++++------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index 1b1575492c..30361ebb12 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -59,8 +59,6 @@ def partition_xlsx( include_header: bool = False, find_subtable: bool = True, date_from_file_object: bool = False, - # TODO marc: fix this too. This is not doing anything atm, just renaming the number - # but keeps starting at 1 starting_page_number: int = 1, **kwargs: Any, ) -> list[Element]: @@ -110,13 +108,10 @@ def partition_xlsx( ) elements: list[Element] = [] - for page_number, (sheet_name, sheet) in enumerate( - opts.sheets.items(), start=starting_page_number - ): - # TODO marc: remove - if page_number != 5: + # Excel counts sheets 1-based + for page_number, (sheet_name, sheet) in enumerate(opts.sheets.items(), start=1): + if page_number < starting_page_number: continue - if not opts.find_subtable: html_text = ( sheet.to_html( # pyright: ignore[reportUnknownMemberType] @@ -153,15 +148,15 @@ def partition_xlsx( else: for component in _ConnectedComponents.from_worksheet_df(sheet): subtable_parser = _SubtableParser(component) - c = subtable_parser._top_left_coordinate[1] + c = subtable_parser.top_left_coordinate[1] # -- emit each leading single-cell row as its own `Text`-subtype element -- for row_index, content in zip( - subtable_parser._leading_single_cell_row_indices, + subtable_parser.leading_single_cell_row_indices, subtable_parser.iter_leading_single_cell_rows_texts(), ): element = _create_element(str(content)) - r = subtable_parser._top_left_coordinate[0] + row_index + include_header + r = subtable_parser.top_left_coordinate[0] + row_index + include_header element.metadata = _get_metadata( sheet_name, page_number, @@ -185,7 +180,7 @@ def partition_xlsx( ) element = Table(text=text) r = ( - subtable_parser._top_left_coordinate[0] + subtable_parser.top_left_coordinate[0] + subtable_parser.core_table_start + include_header ) @@ -205,12 +200,12 @@ def partition_xlsx( # -- emit each trailing single-cell row as its own `Text`-subtype element -- for row_index, content in zip( - subtable_parser._trailing_single_cell_row_indices, + subtable_parser.trailing_single_cell_row_indices, subtable_parser.iter_trailing_single_cell_rows_texts(), ): element = _create_element(str(content)) r = ( - subtable_parser._top_left_coordinate[0] + subtable_parser.top_left_coordinate[0] + len(component.subtable) + row_index + include_header @@ -369,12 +364,12 @@ def merge(self, other: _ConnectedComponent) -> _ConnectedComponent: @lazyproperty def max_x(self) -> int: """The right-most column index of the connected component.""" - return self._extents[2] + return self.extents[2] @lazyproperty def min_x(self) -> int: """The left-most column index of the connected component.""" - return self._extents[0] + return self.extents[0] @lazyproperty def subtable(self) -> pd.DataFrame: @@ -383,11 +378,11 @@ def subtable(self) -> pd.DataFrame: The subtable is the rectangular region of the worksheet inside the connected-component bounding-box. Row-indices and column labels are preserved, not restarted at 0. """ - min_x, min_y, max_x, max_y = self._extents + min_x, min_y, max_x, max_y = self.extents return self._worksheet.iloc[min_x : max_x + 1, min_y : max_y + 1] @lazyproperty - def _extents(self) -> tuple[int, int, int, int]: + def extents(self) -> tuple[int, int, int, int]: """Compute bounding box of this connected component.""" min_x, min_y, max_x, max_y = float("inf"), float("inf"), float("-inf"), float("-inf") for x, y in self._cell_coordinate_set: @@ -498,12 +493,12 @@ class _SubtableParser: def __init__(self, component: _ConnectedComponent): self._subtable = component.subtable - self._top_left_coordinate = component._extents[0:2] + self.top_left_coordinate = component.extents[0:2] @lazyproperty def core_table_start(self) -> int: """The index of the first row in the core-table.""" - return len(self._leading_single_cell_row_indices) + return len(self.leading_single_cell_row_indices) @lazyproperty def core_table(self) -> pd.DataFrame | None: @@ -517,23 +512,23 @@ def core_table(self) -> pd.DataFrame | None: # -- assert: there is at least one core-table row (leading single-cell rows greedily # -- consumes all consecutive single-cell rows. - core_table_stop = len(self._subtable) - len(self._trailing_single_cell_row_indices) + core_table_stop = len(self._subtable) - len(self.trailing_single_cell_row_indices) # -- core-table is what's left in-between -- return self._subtable[self.core_table_start : core_table_stop] def iter_leading_single_cell_rows_texts(self) -> Iterator[str]: """Generate the cell-text for each leading single-cell row.""" - for row_idx in self._leading_single_cell_row_indices: + for row_idx in self.leading_single_cell_row_indices: yield self._subtable.iloc[row_idx].dropna().iloc[0] # pyright: ignore def iter_trailing_single_cell_rows_texts(self) -> Iterator[str]: """Generate the cell-text for each trailing single-cell row.""" - for row_idx in self._trailing_single_cell_row_indices: + for row_idx in self.trailing_single_cell_row_indices: yield self._subtable.iloc[row_idx].dropna().iloc[0] # pyright: ignore @lazyproperty - def _leading_single_cell_row_indices(self) -> tuple[int, ...]: + def leading_single_cell_row_indices(self) -> tuple[int, ...]: """Index of each leading single-cell row in subtable, in top-down order.""" def iter_leading_single_cell_row_indices() -> Iterator[int]: @@ -557,10 +552,10 @@ def iter_single_cell_row_idxs() -> Iterator[int]: return tuple(iter_single_cell_row_idxs()) @lazyproperty - def _trailing_single_cell_row_indices(self) -> tuple[int, ...]: + def trailing_single_cell_row_indices(self) -> tuple[int, ...]: """Index of each trailing single-cell row in subtable, in top-down order.""" # -- if all subtable rows are single-cell, then by convention they are all leading -- - if len(self._leading_single_cell_row_indices) == len(self._subtable): + if len(self.leading_single_cell_row_indices) == len(self._subtable): return () def iter_trailing_single_cell_row_indices() -> Iterator[int]: From 74e1816e789a1ffc7855698cf793aef795dded9c Mon Sep 17 00:00:00 2001 From: "marc.torsoc" Date: Thu, 20 Jun 2024 13:46:36 +0200 Subject: [PATCH 3/5] updated changelog and version --- CHANGELOG.md | 6 +++++- README.md | 2 ++ unstructured/__version__.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 616b3ef8f2..2f22615ddb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.14.7-dev5 +## 0.14.7-dev6 ### Enhancements @@ -8,12 +8,16 @@ ### Features * **Expose conversion functions for tables** Adds public functions to convert tables from HTML to the Deckerd format and back +* **Add row-column coordinate to partition-xlsx** Adds row-column coordinate to the metadata of the elements returned when partitioning an Excel file. It both returns the coordinate as a python tuple (and 0-based), and in Excel format (e.g. "A1"). +* **Change behaviour of `starting_page_number` in partition-xlsx** Makes this parameter skip any sheet before its value. Previously, it would name the first sheet as in this +value. Now the first sheet is always sheet 1. ### Fixes * **Fix an error publishing docker images.** Update user in docker-smoke-test to reflect changes made by the amd64 image pull from the "unstructured" "wolfi-base" image. * **Fix a IndexError when partitioning a pdf with values for both `extract_image_block_types` and `starting_page_number`. + ## 0.14.6 ### Enhancements diff --git a/README.md b/README.md index ca8d97c595..0b7548b7f6 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,8 @@ If using the optional `pre-commit`, you'll just need to install the hooks with ` `pre-commit` package is installed as part of `make install` mentioned above. Finally, if you decided to use `pre-commit` you can also uninstall the hooks with `pre-commit uninstall`. +Don't forget to increase the + In addition to develop in your local OS we also provide a helper to use docker providing a development environment: ```bash diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2e06b8d47a..bab830681b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.7-dev5" # pragma: no cover +__version__ = "0.14.7-dev6" # pragma: no cover From b4aaa2cbac5340988b429ecb426a13ab6f73b478 Mon Sep 17 00:00:00 2001 From: marcglobality Date: Wed, 31 Jul 2024 15:09:36 +0300 Subject: [PATCH 4/5] fix rc for leading and trailing cells --- unstructured/partition/xlsx.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index 30361ebb12..d1dd7266d4 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -148,13 +148,16 @@ def partition_xlsx( else: for component in _ConnectedComponents.from_worksheet_df(sheet): subtable_parser = _SubtableParser(component) - c = subtable_parser.top_left_coordinate[1] # -- emit each leading single-cell row as its own `Text`-subtype element -- for row_index, content in zip( subtable_parser.leading_single_cell_row_indices, subtable_parser.iter_leading_single_cell_rows_texts(), ): + + # find column (usually it will be top_left_column but not necessarily) + c: int = subtable_parser._subtable.iloc[row_index].notna().idxmax() + element = _create_element(str(content)) r = subtable_parser.top_left_coordinate[0] + row_index + include_header element.metadata = _get_metadata( @@ -162,7 +165,6 @@ def partition_xlsx( page_number, opts, rc=(r, c), - header_included=include_header, ) elements.append(element) @@ -184,12 +186,12 @@ def partition_xlsx( + subtable_parser.core_table_start + include_header ) + c = subtable_parser.top_left_coordinate[1] element.metadata = _get_metadata( sheet_name, page_number, opts, rc=(r, c), - header_included=include_header, ) element.metadata.text_as_html = ( html_text if opts.infer_table_structure else None @@ -206,12 +208,15 @@ def partition_xlsx( element = _create_element(str(content)) r = ( subtable_parser.top_left_coordinate[0] - + len(component.subtable) + row_index + include_header ) + c: int = subtable_parser._subtable.iloc[row_index].notna().idxmax() element.metadata = _get_metadata( - sheet_name, page_number, opts, rc=(r, c), header_included=include_header + sheet_name, + page_number, + opts, + rc=(r, c), ) elements.append(element) @@ -589,10 +594,9 @@ def _get_metadata( page_number: int, opts: _XlsxPartitionerOptions, rc: Optional[_CellCoordinate] = None, - header_included: bool = False, ) -> ElementMetadata: """Returns metadata depending on `include_metadata` flag""" - excel_rc = _turn_coordinate_to_excel_format(rc, header_included=header_included) + excel_rc = _turn_coordinate_to_excel_format(rc) return ( ElementMetadata( page_name=sheet_name, @@ -606,10 +610,7 @@ def _get_metadata( ) -def _turn_coordinate_to_excel_format( - rc: _CellCoordinate | None, - header_included: bool = False, -) -> None | str: +def _turn_coordinate_to_excel_format(rc: _CellCoordinate | None) -> None | str: """ Converts a tuple of row and column indices to Excel-like cell coordinate. Takes into account that @@ -620,16 +621,13 @@ def _turn_coordinate_to_excel_format( return None row, col = rc - # Adjust column index to be 1-based + # Adjust row and column index to be 1-based col += 1 + row += 1 col_letters = "" while col > 0: col, remainder = divmod(col - 1, 26) col_letters = chr(remainder + 65) + col_letters - # Adjust row index if header is included - if header_included: - row += 1 - return f"{col_letters}{row}" From d1732bd602caa9a909fd28f14f7494f0882fea3d Mon Sep 17 00:00:00 2001 From: "marc.torsoc" Date: Fri, 2 Aug 2024 00:19:33 +0300 Subject: [PATCH 5/5] remove pending bit about starting_page_number --- CHANGELOG.md | 3 ++- README.md | 2 -- unstructured/partition/xlsx.py | 7 +++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 89aedfb9a7..99deeb32a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -100,12 +100,13 @@ * **Expose conversion functions for tables** Adds public functions to convert tables from HTML to the Deckerd format and back +* **Adds Kafka Source and Destination** New source and destination connector added to all CLI ingest commands to support reading from and writing to Kafka streams. Also supports Confluent Kafka. + ### Fixes * **Fix an error publishing docker images.** Update user in docker-smoke-test to reflect changes made by the amd64 image pull from the "unstructured" "wolfi-base" image. * **Fix a IndexError when partitioning a pdf with values for both `extract_image_block_types` and `starting_page_number`. - ## 0.14.6 ### Enhancements diff --git a/README.md b/README.md index 7e7993b0db..f7f386fa05 100644 --- a/README.md +++ b/README.md @@ -166,8 +166,6 @@ If using the optional `pre-commit`, you'll just need to install the hooks with ` `pre-commit` package is installed as part of `make install` mentioned above. Finally, if you decided to use `pre-commit` you can also uninstall the hooks with `pre-commit uninstall`. -Don't forget to increase the - In addition to develop in your local OS we also provide a helper to use docker providing a development environment: ```bash diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index 5e376ebf95..ebb47781ae 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -109,10 +109,9 @@ def partition_xlsx( ) elements: list[Element] = [] - # Excel counts sheets 1-based - for page_number, (sheet_name, sheet) in enumerate(opts.sheets.items(), start=1): - if page_number < starting_page_number: - continue + for page_number, (sheet_name, sheet) in enumerate( + opts.sheets.items(), start=starting_page_number + ): if not opts.find_subtable: html_text = ( sheet.to_html( # pyright: ignore[reportUnknownMemberType]