Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Merge request] bug fix on table structure metric #3089

Merged
merged 14 commits into from
Jun 10, 2024
Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

### Fixes

**table metric bug fix** `get_element_level_alignment()` now finds all the matched indices in the predicted table data instead of only returning the first match when there are multiple matches for the same ground-truth (gt) string.

## 0.14.5

### Enhancements
Expand Down
1 change: 1 addition & 0 deletions unstructured/metrics/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ def _try_process_document(self, doc: Path) -> Optional[list]:
@abstractmethod
def _process_document(self, doc: Path) -> list:
"""Should return all metadata and metrics for a single document."""
pass


@dataclass
Expand Down
37 changes: 29 additions & 8 deletions unstructured/metrics/table/table_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,9 @@ def get_table_level_alignment(

@staticmethod
def _zip_to_dataframe(table_data: List[Dict[str, Any]]) -> pd.DataFrame:
df = pd.DataFrame(table_data).pivot(
index="row_index",
columns="col_index",
values="content",
)
df = pd.DataFrame(table_data, columns=["row_index", "col_index", "content"])
df = df.set_index("row_index")
df["col_index"] = df["col_index"].astype(str)
return df

@staticmethod
Expand Down Expand Up @@ -100,7 +98,7 @@ def get_element_level_alignment(

# Get row and col index accuracy
ground_truth_td_contents_list = [gtd["content"].lower() for gtd in ground_truth_td]

used_indices = set()
indices_tuple_pairs = []
for td_ele in td:
content = td_ele["content"].lower()
Expand All @@ -113,8 +111,31 @@ def get_element_level_alignment(
cutoff=cutoff,
n=1,
)
matched_idx = ground_truth_td_contents_list.index(matches[0]) if matches else -1

# BUG FIX: previously matched_idx was always the first index of the match in
# ground_truth_td_contents_list, even when the matched string occurs more than once.
# Now each duplicate is assigned the next unused index of that string; once all of
# its indices are used, used_indices is cleared and the search starts over the same way.
matching_indices = []
if matches != []:
b_indices = [
i
for i, b_string in enumerate(ground_truth_td_contents_list)
if b_string == matches[0] and i not in used_indices
]
if not b_indices:
# If all indices are used, reset used_indices and use the first index
used_indices.clear()
b_indices = [
i
for i, b_string in enumerate(ground_truth_td_contents_list)
if b_string == matches[0] and i not in used_indices
]
matching_index = b_indices[0]
matching_indices.append(matching_index)
used_indices.add(matching_index)
else:
matching_indices = [-1]
matched_idx = matching_indices[0]
if matched_idx >= 0:
gt_row_index = ground_truth_td[matched_idx]["row_index"]
gt_col_index = ground_truth_td[matched_idx]["col_index"]
Expand Down
Loading