From d82a34519e54af0cb6290ba09502425f30509423 Mon Sep 17 00:00:00 2001 From: Tracy Shen <34946571+tbs17@users.noreply.github.com> Date: Mon, 10 Jun 2024 11:14:32 -0400 Subject: [PATCH] [Merge request] bug fix on table structure metric (#3089) **Summary** This fix provides better logic for computing matched_idx when calculating the table structure metric, giving a more accurate calculation of the accuracy (acc). **Additional Context** - this fix initially passed the CI run in Draft PR #3025 - therefore, this time we would like to merge it into the main branch - this commit has merged the latest changes from main made after the Draft PR --- CHANGELOG.md | 2 + unstructured/metrics/evaluate.py | 1 + unstructured/metrics/table/table_alignment.py | 37 +++++++++++++++---- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8248ee26bc..9ced61c28c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ ### Fixes +**table metric bug fix** get_element_level_alignment() now finds all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string. 
+ ## 0.14.5 ### Enhancements diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py index 4ea44a237a..a6d97962cf 100755 --- a/unstructured/metrics/evaluate.py +++ b/unstructured/metrics/evaluate.py @@ -160,6 +160,7 @@ def _try_process_document(self, doc: Path) -> Optional[list]: @abstractmethod def _process_document(self, doc: Path) -> list: """Should return all metadata and metrics for a single document.""" + pass @dataclass diff --git a/unstructured/metrics/table/table_alignment.py b/unstructured/metrics/table/table_alignment.py index 23d5d00a8f..35acc2625a 100644 --- a/unstructured/metrics/table/table_alignment.py +++ b/unstructured/metrics/table/table_alignment.py @@ -50,11 +50,9 @@ def get_table_level_alignment( @staticmethod def _zip_to_dataframe(table_data: List[Dict[str, Any]]) -> pd.DataFrame: - df = pd.DataFrame(table_data).pivot( - index="row_index", - columns="col_index", - values="content", - ) + df = pd.DataFrame(table_data, columns=["row_index", "col_index", "content"]) + df = df.set_index("row_index") + df["col_index"] = df["col_index"].astype(str) return df @staticmethod @@ -100,7 +98,7 @@ def get_element_level_alignment( # Get row and col index accuracy ground_truth_td_contents_list = [gtd["content"].lower() for gtd in ground_truth_td] - + used_indices = set() indices_tuple_pairs = [] for td_ele in td: content = td_ele["content"].lower() @@ -113,8 +111,31 @@ def get_element_level_alignment( cutoff=cutoff, n=1, ) - matched_idx = ground_truth_td_contents_list.index(matches[0]) if matches else -1 - + # BUG FIX: the previous matched_idx will only output the first matched index if + # the match has duplicates in the + # ground_truth_td_contents_list, the current fix will output its correspondence idx + # once matching is exhausted, it will go back search again the same fashion + matching_indices = [] + if matches != []: + b_indices = [ + i + for i, b_string in enumerate(ground_truth_td_contents_list) + if b_string == 
matches[0] and i not in used_indices + ] + if not b_indices: + # If all indices are used, reset used_indices and use the first index + used_indices.clear() + b_indices = [ + i + for i, b_string in enumerate(ground_truth_td_contents_list) + if b_string == matches[0] and i not in used_indices + ] + matching_index = b_indices[0] + matching_indices.append(matching_index) + used_indices.add(matching_index) + else: + matching_indices = [-1] + matched_idx = matching_indices[0] if matched_idx >= 0: gt_row_index = ground_truth_td[matched_idx]["row_index"] gt_col_index = ground_truth_td[matched_idx]["col_index"]