From 7c877b84e8ba9213fb11d7ef6d7b603e81fc5828 Mon Sep 17 00:00:00 2001
From: tbs17 <jia.t.shen@gmail.com>
Date: Wed, 15 May 2024 13:02:44 -0400
Subject: [PATCH] add bug fix for table metric

---
 unstructured/metrics/evaluate.py              |  1 -
 unstructured/metrics/table/table_alignment.py | 42 +++++++++++++++----
 2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py
index a6d97962cf..4ea44a237a 100755
--- a/unstructured/metrics/evaluate.py
+++ b/unstructured/metrics/evaluate.py
@@ -160,7 +160,6 @@ def _try_process_document(self, doc: Path) -> Optional[list]:
     @abstractmethod
     def _process_document(self, doc: Path) -> list:
         """Should return all metadata and metrics for a single document."""
-        pass
 
 
 @dataclass
diff --git a/unstructured/metrics/table/table_alignment.py b/unstructured/metrics/table/table_alignment.py
index 23d5d00a8f..d663247268 100644
--- a/unstructured/metrics/table/table_alignment.py
+++ b/unstructured/metrics/table/table_alignment.py
@@ -50,11 +50,14 @@ def get_table_level_alignment(
 
     @staticmethod
     def _zip_to_dataframe(table_data: List[Dict[str, Any]]) -> pd.DataFrame:
-        df = pd.DataFrame(table_data).pivot(
-            index="row_index",
-            columns="col_index",
-            values="content",
-        )
+        # df = pd.DataFrame(table_data).pivot(
+        #     index="row_index",
+        #     columns="col_index",
+        #     values="content",
+        # )
+        df = pd.DataFrame(table_data, columns=["row_index", "col_index", "content"])
+        df = df.set_index("row_index")
+        df["col_index"] = df["col_index"].astype(str)
         return df
 
     @staticmethod
@@ -100,7 +103,7 @@ def get_element_level_alignment(
 
             # Get row and col index accuracy
             ground_truth_td_contents_list = [gtd["content"].lower() for gtd in ground_truth_td]
-
+            used_indices = set()
             indices_tuple_pairs = []
             for td_ele in td:
                 content = td_ele["content"].lower()
@@ -113,8 +116,31 @@ def get_element_level_alignment(
                     cutoff=cutoff,
                     n=1,
                 )
-                matched_idx = ground_truth_td_contents_list.index(matches[0]) if matches else -1
-
+                # BUG FIX: list.index() always returns the first occurrence, so when the matched content appears more
+                # than once in ground_truth_td_contents_list every duplicate resolved to the same index. Use the first
+                # occurrence not yet in used_indices; once all are used, clear used_indices and start over. Old code:
+                # matched_idx = ground_truth_td_contents_list.index(matches[0]) if matches else -1
+                matching_indices = []
+                if matches != []:
+                    b_indices = [
+                        i
+                        for i, b_string in enumerate(ground_truth_td_contents_list)
+                        if b_string == matches[0] and i not in used_indices
+                    ]
+                    if not b_indices:
+                        # If all indices are used, reset used_indices and use the first index
+                        used_indices.clear()
+                        b_indices = [
+                            i
+                            for i, b_string in enumerate(ground_truth_td_contents_list)
+                            if b_string == matches[0] and i not in used_indices
+                        ]
+                    matching_index = b_indices[0]
+                    matching_indices.append(matching_index)
+                    used_indices.add(matching_index)
+                else:
+                    matching_indices = [-1]
+                matched_idx = matching_indices[0]
                 if matched_idx >= 0:
                     gt_row_index = ground_truth_td[matched_idx]["row_index"]
                     gt_col_index = ground_truth_td[matched_idx]["col_index"]