From 4e1681c80092e8ee756b4b7070cd012bc4efa5aa Mon Sep 17 00:00:00 2001
From: "marc.torsoc" <marc.torsoc@gmail.com>
Date: Thu, 20 Jun 2024 13:14:21 +0200
Subject: [PATCH 1/5] add rc to partition excel + format

---
 unstructured/partition/xlsx.py | 124 ++++++++++++++++++++++++++++-----
 1 file changed, 105 insertions(+), 19 deletions(-)

diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
index f219ab9741..1b1575492c 100644
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@@ -15,6 +15,7 @@
 from unstructured.chunking import add_chunking_strategy
 from unstructured.cleaners.core import clean_bullets
 from unstructured.documents.elements import (
+    DataSourceMetadata,
     Element,
     ElementMetadata,
     ListItem,
@@ -25,7 +26,10 @@
     process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
-from unstructured.partition.common import get_last_modified_date, get_last_modified_date_from_file
+from unstructured.partition.common import (
+    get_last_modified_date,
+    get_last_modified_date_from_file,
+)
 from unstructured.partition.lang import apply_lang_metadata
 from unstructured.partition.text_type import (
     is_bulleted_text,
@@ -55,6 +59,8 @@ def partition_xlsx(
     include_header: bool = False,
     find_subtable: bool = True,
     date_from_file_object: bool = False,
+    # TODO marc: fix this too. This is not doing anything atm, just renaming the number
+    # but keeps starting at 1
     starting_page_number: int = 1,
     **kwargs: Any,
 ) -> list[Element]:
@@ -107,6 +113,10 @@ def partition_xlsx(
     for page_number, (sheet_name, sheet) in enumerate(
         opts.sheets.items(), start=starting_page_number
     ):
+        # TODO marc: remove
+        if page_number != 5:
+            continue
+
         if not opts.find_subtable:
             html_text = (
                 sheet.to_html(  # pyright: ignore[reportUnknownMemberType]
@@ -130,6 +140,9 @@ def partition_xlsx(
                     page_number=page_number,
                     filename=opts.metadata_file_path,
                     last_modified=opts.last_modified,
+                    data_source=DataSourceMetadata(
+                        record_locator=dict(rc=(0, 0), excel_rc="A1"),
+                    ),
                 )
                 metadata.detection_origin = DETECTION_ORIGIN
             else:
@@ -139,12 +152,23 @@ def partition_xlsx(
             elements.append(table)
         else:
             for component in _ConnectedComponents.from_worksheet_df(sheet):
-                subtable_parser = _SubtableParser(component.subtable)
+                subtable_parser = _SubtableParser(component)
+                c = subtable_parser._top_left_coordinate[1]
 
                 # -- emit each leading single-cell row as its own `Text`-subtype element --
-                for content in subtable_parser.iter_leading_single_cell_rows_texts():
+                for row_index, content in zip(
+                    subtable_parser._leading_single_cell_row_indices,
+                    subtable_parser.iter_leading_single_cell_rows_texts(),
+                ):
                     element = _create_element(str(content))
-                    element.metadata = _get_metadata(sheet_name, page_number, opts)
+                    r = subtable_parser._top_left_coordinate[0] + row_index + include_header
+                    element.metadata = _get_metadata(
+                        sheet_name,
+                        page_number,
+                        opts,
+                        rc=(r, c),
+                        header_included=include_header,
+                    )
                     elements.append(element)
 
                 # -- emit core-table (if it exists) as a `Table` element --
@@ -160,7 +184,18 @@ def partition_xlsx(
                         ).text_content(),
                     )
                     element = Table(text=text)
-                    element.metadata = _get_metadata(sheet_name, page_number, opts)
+                    r = (
+                        subtable_parser._top_left_coordinate[0]
+                        + subtable_parser.core_table_start
+                        + include_header
+                    )
+                    element.metadata = _get_metadata(
+                        sheet_name,
+                        page_number,
+                        opts,
+                        rc=(r, c),
+                        header_included=include_header,
+                    )
                     element.metadata.text_as_html = (
                         html_text if opts.infer_table_structure else None
                     )
@@ -169,9 +204,20 @@ def partition_xlsx(
                 # -- no core-table is emitted if it's empty (all rows are single-cell rows) --
 
                 # -- emit each trailing single-cell row as its own `Text`-subtype element --
-                for content in subtable_parser.iter_trailing_single_cell_rows_texts():
+                for row_index, content in zip(
+                    subtable_parser._trailing_single_cell_row_indices,
+                    subtable_parser.iter_trailing_single_cell_rows_texts(),
+                ):
                     element = _create_element(str(content))
-                    element.metadata = _get_metadata(sheet_name, page_number, opts)
+                    r = (
+                        subtable_parser._top_left_coordinate[0]
+                        + len(component.subtable)
+                        + row_index
+                        + include_header
+                    )
+                    element.metadata = _get_metadata(
+                        sheet_name, page_number, opts, rc=(r, c), header_included=include_header
+                    )
                     elements.append(element)
 
     elements = list(
@@ -310,21 +356,21 @@ def __init__(self, worksheet: pd.DataFrame, cell_coordinate_set: set[_CellCoordi
         self._worksheet = worksheet
         self._cell_coordinate_set = cell_coordinate_set
 
-    @lazyproperty
-    def max_x(self) -> int:
-        """The right-most column index of the connected component."""
-        return self._extents[2]
-
     def merge(self, other: _ConnectedComponent) -> _ConnectedComponent:
         """Produce new instance with union of cells in `self` and `other`.
 
-        Used to combine regions of workshet that are "overlapping" row-wise but not actually
+        Used to combine regions of worksheet that are "overlapping" row-wise but not actually
         2D-connected.
         """
         return _ConnectedComponent(
             self._worksheet, self._cell_coordinate_set.union(other._cell_coordinate_set)
         )
 
+    @lazyproperty
+    def max_x(self) -> int:
+        """The right-most column index of the connected component."""
+        return self._extents[2]
+
     @lazyproperty
     def min_x(self) -> int:
         """The left-most column index of the connected component."""
@@ -450,17 +496,22 @@ class _SubtableParser:
     element.
     """
 
-    def __init__(self, subtable: pd.DataFrame):
-        self._subtable = subtable
+    def __init__(self, component: _ConnectedComponent):
+        self._subtable = component.subtable
+        self._top_left_coordinate = component._extents[0:2]
+
+    @lazyproperty
+    def core_table_start(self) -> int:
+        """The index of the first row in the core-table."""
+        return len(self._leading_single_cell_row_indices)
 
     @lazyproperty
     def core_table(self) -> pd.DataFrame | None:
         """The part between the leading and trailing single-cell rows, if any."""
-        core_table_start = len(self._leading_single_cell_row_indices)
 
         # -- if core-table start is the end of table, there is no core-table
         # -- (all rows are single-cell)
-        if core_table_start == len(self._subtable):
+        if self.core_table_start == len(self._subtable):
             return None
 
         # -- assert: there is at least one core-table row (leading single-cell rows greedily
@@ -469,7 +520,7 @@ def core_table(self) -> pd.DataFrame | None:
         core_table_stop = len(self._subtable) - len(self._trailing_single_cell_row_indices)
 
         # -- core-table is what's left in-between --
-        return self._subtable[core_table_start:core_table_stop]
+        return self._subtable[self.core_table_start : core_table_stop]
 
     def iter_leading_single_cell_rows_texts(self) -> Iterator[str]:
         """Generate the cell-text for each leading single-cell row."""
@@ -539,16 +590,51 @@ def _create_element(text: str) -> Element:
 
 
 def _get_metadata(
-    sheet_name: str, page_number: int, opts: _XlsxPartitionerOptions
+    sheet_name: str,
+    page_number: int,
+    opts: _XlsxPartitionerOptions,
+    rc: Optional[_CellCoordinate] = None,
+    header_included: bool = False,
 ) -> ElementMetadata:
     """Returns metadata depending on `include_metadata` flag"""
+    excel_rc = _turn_coordinate_to_excel_format(rc, header_included=header_included)
     return (
         ElementMetadata(
             page_name=sheet_name,
             page_number=page_number,
             filename=opts.metadata_file_path,
             last_modified=opts.last_modified,
+            data_source=DataSourceMetadata(record_locator=dict(rc=rc, excel_rc=excel_rc)),
         )
         if opts.include_metadata
         else ElementMetadata()
     )
+
+
+def _turn_coordinate_to_excel_format(
+    rc: _CellCoordinate | None,
+    header_included: bool = False,
+) -> None | str:
+    """
+    Converts a tuple of row and column indices to Excel-like cell coordinate.
+    Takes into account that
+    - after Z it starts with AA, AB, ..., ZZ, AAA, ...
+    - if the header was included, the row index needs to be increased by 1
+    """
+    if rc is None:
+        return None
+    row, col = rc
+
+    # Adjust column index to be 1-based
+    col += 1
+
+    col_letters = ""
+    while col > 0:
+        col, remainder = divmod(col - 1, 26)
+        col_letters = chr(remainder + 65) + col_letters
+
+    # Adjust row index if header is included
+    if header_included:
+        row += 1
+
+    return f"{col_letters}{row}"

From 6484e88b7fb5e031d0101201782a57f589105d29 Mon Sep 17 00:00:00 2001
From: "marc.torsoc" <marc.torsoc@gmail.com>
Date: Thu, 20 Jun 2024 13:34:51 +0200
Subject: [PATCH 2/5] fix starting_page_number

---
 unstructured/partition/xlsx.py | 47 +++++++++++++++-------------------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
index 1b1575492c..30361ebb12 100644
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@@ -59,8 +59,6 @@ def partition_xlsx(
     include_header: bool = False,
     find_subtable: bool = True,
     date_from_file_object: bool = False,
-    # TODO marc: fix this too. This is not doing anything atm, just renaming the number
-    # but keeps starting at 1
     starting_page_number: int = 1,
     **kwargs: Any,
 ) -> list[Element]:
@@ -110,13 +108,10 @@ def partition_xlsx(
     )
 
     elements: list[Element] = []
-    for page_number, (sheet_name, sheet) in enumerate(
-        opts.sheets.items(), start=starting_page_number
-    ):
-        # TODO marc: remove
-        if page_number != 5:
+    # Excel counts sheets 1-based
+    for page_number, (sheet_name, sheet) in enumerate(opts.sheets.items(), start=1):
+        if page_number < starting_page_number:
             continue
-
         if not opts.find_subtable:
             html_text = (
                 sheet.to_html(  # pyright: ignore[reportUnknownMemberType]
@@ -153,15 +148,15 @@ def partition_xlsx(
         else:
             for component in _ConnectedComponents.from_worksheet_df(sheet):
                 subtable_parser = _SubtableParser(component)
-                c = subtable_parser._top_left_coordinate[1]
+                c = subtable_parser.top_left_coordinate[1]
 
                 # -- emit each leading single-cell row as its own `Text`-subtype element --
                 for row_index, content in zip(
-                    subtable_parser._leading_single_cell_row_indices,
+                    subtable_parser.leading_single_cell_row_indices,
                     subtable_parser.iter_leading_single_cell_rows_texts(),
                 ):
                     element = _create_element(str(content))
-                    r = subtable_parser._top_left_coordinate[0] + row_index + include_header
+                    r = subtable_parser.top_left_coordinate[0] + row_index + include_header
                     element.metadata = _get_metadata(
                         sheet_name,
                         page_number,
@@ -185,7 +180,7 @@ def partition_xlsx(
                     )
                     element = Table(text=text)
                     r = (
-                        subtable_parser._top_left_coordinate[0]
+                        subtable_parser.top_left_coordinate[0]
                         + subtable_parser.core_table_start
                         + include_header
                     )
@@ -205,12 +200,12 @@ def partition_xlsx(
 
                 # -- emit each trailing single-cell row as its own `Text`-subtype element --
                 for row_index, content in zip(
-                    subtable_parser._trailing_single_cell_row_indices,
+                    subtable_parser.trailing_single_cell_row_indices,
                     subtable_parser.iter_trailing_single_cell_rows_texts(),
                 ):
                     element = _create_element(str(content))
                     r = (
-                        subtable_parser._top_left_coordinate[0]
+                        subtable_parser.top_left_coordinate[0]
                         + len(component.subtable)
                         + row_index
                         + include_header
@@ -369,12 +364,12 @@ def merge(self, other: _ConnectedComponent) -> _ConnectedComponent:
     @lazyproperty
     def max_x(self) -> int:
         """The right-most column index of the connected component."""
-        return self._extents[2]
+        return self.extents[2]
 
     @lazyproperty
     def min_x(self) -> int:
         """The left-most column index of the connected component."""
-        return self._extents[0]
+        return self.extents[0]
 
     @lazyproperty
     def subtable(self) -> pd.DataFrame:
@@ -383,11 +378,11 @@ def subtable(self) -> pd.DataFrame:
         The subtable is the rectangular region of the worksheet inside the connected-component
         bounding-box. Row-indices and column labels are preserved, not restarted at 0.
         """
-        min_x, min_y, max_x, max_y = self._extents
+        min_x, min_y, max_x, max_y = self.extents
         return self._worksheet.iloc[min_x : max_x + 1, min_y : max_y + 1]
 
     @lazyproperty
-    def _extents(self) -> tuple[int, int, int, int]:
+    def extents(self) -> tuple[int, int, int, int]:
         """Compute bounding box of this connected component."""
         min_x, min_y, max_x, max_y = float("inf"), float("inf"), float("-inf"), float("-inf")
         for x, y in self._cell_coordinate_set:
@@ -498,12 +493,12 @@ class _SubtableParser:
 
     def __init__(self, component: _ConnectedComponent):
         self._subtable = component.subtable
-        self._top_left_coordinate = component._extents[0:2]
+        self.top_left_coordinate = component.extents[0:2]
 
     @lazyproperty
     def core_table_start(self) -> int:
         """The index of the first row in the core-table."""
-        return len(self._leading_single_cell_row_indices)
+        return len(self.leading_single_cell_row_indices)
 
     @lazyproperty
     def core_table(self) -> pd.DataFrame | None:
@@ -517,23 +512,23 @@ def core_table(self) -> pd.DataFrame | None:
         # -- assert: there is at least one core-table row (leading single-cell rows greedily
         # -- consumes all consecutive single-cell rows.
 
-        core_table_stop = len(self._subtable) - len(self._trailing_single_cell_row_indices)
+        core_table_stop = len(self._subtable) - len(self.trailing_single_cell_row_indices)
 
         # -- core-table is what's left in-between --
         return self._subtable[self.core_table_start : core_table_stop]
 
     def iter_leading_single_cell_rows_texts(self) -> Iterator[str]:
         """Generate the cell-text for each leading single-cell row."""
-        for row_idx in self._leading_single_cell_row_indices:
+        for row_idx in self.leading_single_cell_row_indices:
             yield self._subtable.iloc[row_idx].dropna().iloc[0]  # pyright: ignore
 
     def iter_trailing_single_cell_rows_texts(self) -> Iterator[str]:
         """Generate the cell-text for each trailing single-cell row."""
-        for row_idx in self._trailing_single_cell_row_indices:
+        for row_idx in self.trailing_single_cell_row_indices:
             yield self._subtable.iloc[row_idx].dropna().iloc[0]  # pyright: ignore
 
     @lazyproperty
-    def _leading_single_cell_row_indices(self) -> tuple[int, ...]:
+    def leading_single_cell_row_indices(self) -> tuple[int, ...]:
         """Index of each leading single-cell row in subtable, in top-down order."""
 
         def iter_leading_single_cell_row_indices() -> Iterator[int]:
@@ -557,10 +552,10 @@ def iter_single_cell_row_idxs() -> Iterator[int]:
         return tuple(iter_single_cell_row_idxs())
 
     @lazyproperty
-    def _trailing_single_cell_row_indices(self) -> tuple[int, ...]:
+    def trailing_single_cell_row_indices(self) -> tuple[int, ...]:
         """Index of each trailing single-cell row in subtable, in top-down order."""
         # -- if all subtable rows are single-cell, then by convention they are all leading --
-        if len(self._leading_single_cell_row_indices) == len(self._subtable):
+        if len(self.leading_single_cell_row_indices) == len(self._subtable):
             return ()
 
         def iter_trailing_single_cell_row_indices() -> Iterator[int]:

From 74e1816e789a1ffc7855698cf793aef795dded9c Mon Sep 17 00:00:00 2001
From: "marc.torsoc" <marc.torsoc@gmail.com>
Date: Thu, 20 Jun 2024 13:46:36 +0200
Subject: [PATCH 3/5] updated changelog and version

---
 CHANGELOG.md                | 6 +++++-
 README.md                   | 2 ++
 unstructured/__version__.py | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 616b3ef8f2..2f22615ddb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.14.7-dev5
+## 0.14.7-dev6
 
 ### Enhancements
 
@@ -8,12 +8,16 @@
 ### Features
 
 * **Expose conversion functions for tables** Adds public functions to convert tables from HTML to the Deckerd format and back
+* **Add row-column coordinate to partition-xlsx** Adds the row-column coordinate of each element to the metadata returned when partitioning an Excel file. The coordinate is returned both as a 0-based Python tuple and in Excel format (e.g. "A1").
+* **Change behaviour of `starting_page_number` in partition-xlsx** Makes this parameter skip any sheet before its value. Previously, it only set the page number
+assigned to the first sheet. Now the first sheet is always sheet 1.
 
 ### Fixes
 
 * **Fix an error publishing docker images.** Update user in docker-smoke-test to reflect changes made by the amd64 image pull from the "unstructured" "wolfi-base" image.
 * **Fix a IndexError when partitioning a pdf with values for both `extract_image_block_types` and `starting_page_number`.
 
+
 ## 0.14.6
 
 ### Enhancements
diff --git a/README.md b/README.md
index ca8d97c595..0b7548b7f6 100644
--- a/README.md
+++ b/README.md
@@ -157,6 +157,8 @@ If using the optional `pre-commit`, you'll just need to install the hooks with `
 `pre-commit` package is installed as part of `make install` mentioned above. Finally, if you decided to use `pre-commit`
 you can also uninstall the hooks with `pre-commit uninstall`.
 
+Don't forget to increase the 
+
 In addition to develop in your local OS we also provide a helper to use docker providing a development environment:
 
 ```bash
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 2e06b8d47a..bab830681b 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.14.7-dev5"  # pragma: no cover
+__version__ = "0.14.7-dev6"  # pragma: no cover

From b4aaa2cbac5340988b429ecb426a13ab6f73b478 Mon Sep 17 00:00:00 2001
From: marcglobality <marc.torrellas-socastro@globality.com>
Date: Wed, 31 Jul 2024 15:09:36 +0300
Subject: [PATCH 4/5] fix rc for leading and trailing cells

---
 unstructured/partition/xlsx.py | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
index 30361ebb12..d1dd7266d4 100644
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@@ -148,13 +148,16 @@ def partition_xlsx(
         else:
             for component in _ConnectedComponents.from_worksheet_df(sheet):
                 subtable_parser = _SubtableParser(component)
-                c = subtable_parser.top_left_coordinate[1]
 
                 # -- emit each leading single-cell row as its own `Text`-subtype element --
                 for row_index, content in zip(
                     subtable_parser.leading_single_cell_row_indices,
                     subtable_parser.iter_leading_single_cell_rows_texts(),
                 ):
+
+                    # find column (usually it will be top_left_column but not necessarily)
+                    c: int = subtable_parser._subtable.iloc[row_index].notna().idxmax()
+
                     element = _create_element(str(content))
                     r = subtable_parser.top_left_coordinate[0] + row_index + include_header
                     element.metadata = _get_metadata(
@@ -162,7 +165,6 @@ def partition_xlsx(
                         page_number,
                         opts,
                         rc=(r, c),
-                        header_included=include_header,
                     )
                     elements.append(element)
 
@@ -184,12 +186,12 @@ def partition_xlsx(
                         + subtable_parser.core_table_start
                         + include_header
                     )
+                    c = subtable_parser.top_left_coordinate[1]
                     element.metadata = _get_metadata(
                         sheet_name,
                         page_number,
                         opts,
                         rc=(r, c),
-                        header_included=include_header,
                     )
                     element.metadata.text_as_html = (
                         html_text if opts.infer_table_structure else None
@@ -206,12 +208,15 @@ def partition_xlsx(
                     element = _create_element(str(content))
                     r = (
                         subtable_parser.top_left_coordinate[0]
-                        + len(component.subtable)
                         + row_index
                         + include_header
                     )
+                    c: int = subtable_parser._subtable.iloc[row_index].notna().idxmax()
                     element.metadata = _get_metadata(
-                        sheet_name, page_number, opts, rc=(r, c), header_included=include_header
+                        sheet_name,
+                        page_number,
+                        opts,
+                        rc=(r, c),
                     )
                     elements.append(element)
 
@@ -589,10 +594,9 @@ def _get_metadata(
     page_number: int,
     opts: _XlsxPartitionerOptions,
     rc: Optional[_CellCoordinate] = None,
-    header_included: bool = False,
 ) -> ElementMetadata:
     """Returns metadata depending on `include_metadata` flag"""
-    excel_rc = _turn_coordinate_to_excel_format(rc, header_included=header_included)
+    excel_rc = _turn_coordinate_to_excel_format(rc)
     return (
         ElementMetadata(
             page_name=sheet_name,
@@ -606,10 +610,7 @@ def _get_metadata(
     )
 
 
-def _turn_coordinate_to_excel_format(
-    rc: _CellCoordinate | None,
-    header_included: bool = False,
-) -> None | str:
+def _turn_coordinate_to_excel_format(rc: _CellCoordinate | None) -> None | str:
     """
     Converts a tuple of row and column indices to Excel-like cell coordinate.
     Takes into account that
@@ -620,16 +621,13 @@ def _turn_coordinate_to_excel_format(
         return None
     row, col = rc
 
-    # Adjust column index to be 1-based
+    # Adjust row and column indices to be 1-based
     col += 1
+    row += 1
 
     col_letters = ""
     while col > 0:
         col, remainder = divmod(col - 1, 26)
         col_letters = chr(remainder + 65) + col_letters
 
-    # Adjust row index if header is included
-    if header_included:
-        row += 1
-
     return f"{col_letters}{row}"

From d1732bd602caa9a909fd28f14f7494f0882fea3d Mon Sep 17 00:00:00 2001
From: "marc.torsoc" <marc.torsoc@gmail.com>
Date: Fri, 2 Aug 2024 00:19:33 +0300
Subject: [PATCH 5/5] remove pending bit about starting_page_number

---
 CHANGELOG.md                   | 3 ++-
 README.md                      | 2 --
 unstructured/partition/xlsx.py | 7 +++----
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 89aedfb9a7..99deeb32a5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -100,12 +100,13 @@
 
 * **Expose conversion functions for tables** Adds public functions to convert tables from HTML to the Deckerd format and back
 
+* **Adds Kafka Source and Destination** New source and destination connector added to all CLI ingest commands to support reading from and writing to Kafka streams. Also supports Confluent Kafka.
+
 ### Fixes
 
 * **Fix an error publishing docker images.** Update user in docker-smoke-test to reflect changes made by the amd64 image pull from the "unstructured" "wolfi-base" image.
 * **Fix a IndexError when partitioning a pdf with values for both `extract_image_block_types` and `starting_page_number`.
 
-
 ## 0.14.6
 
 ### Enhancements
diff --git a/README.md b/README.md
index 7e7993b0db..f7f386fa05 100644
--- a/README.md
+++ b/README.md
@@ -166,8 +166,6 @@ If using the optional `pre-commit`, you'll just need to install the hooks with `
 `pre-commit` package is installed as part of `make install` mentioned above. Finally, if you decided to use `pre-commit`
 you can also uninstall the hooks with `pre-commit uninstall`.
 
-Don't forget to increase the 
-
 In addition to develop in your local OS we also provide a helper to use docker providing a development environment:
 
 ```bash
diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
index 5e376ebf95..ebb47781ae 100644
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@@ -109,10 +109,9 @@ def partition_xlsx(
     )
 
     elements: list[Element] = []
-    # Excel counts sheets 1-based
-    for page_number, (sheet_name, sheet) in enumerate(opts.sheets.items(), start=1):
-        if page_number < starting_page_number:
-            continue
+    for page_number, (sheet_name, sheet) in enumerate(
+        opts.sheets.items(), start=starting_page_number
+    ):
         if not opts.find_subtable:
             html_text = (
                 sheet.to_html(  # pyright: ignore[reportUnknownMemberType]