Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add bug fix for table metric #3025

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 34 additions & 8 deletions unstructured/metrics/table/table_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,14 @@ def get_table_level_alignment(

@staticmethod
def _zip_to_dataframe(table_data: List[Dict[str, Any]]) -> pd.DataFrame:
    """Convert a list of table-cell dicts into a long-format DataFrame.

    Args:
        table_data: One dict per table cell, carrying the keys "row_index",
            "col_index" and "content".

    Returns:
        A DataFrame with one row per input cell, indexed by "row_index", with
        a "col_index" column cast to ``str`` and a "content" column.
    """
    # No pivot() here on purpose: the earlier pivot result was immediately
    # overwritten, and it raised ValueError whenever two cells shared the same
    # (row_index, col_index) pair.
    df = pd.DataFrame(table_data, columns=["row_index", "col_index", "content"])
    df = df.set_index("row_index")
    df["col_index"] = df["col_index"].astype(str)
    return df

@staticmethod
Expand Down Expand Up @@ -100,7 +103,7 @@ def get_element_level_alignment(

# Get row and col index accuracy
ground_truth_td_contents_list = [gtd["content"].lower() for gtd in ground_truth_td]

used_indices = set()
indices_tuple_pairs = []
for td_ele in td:
content = td_ele["content"].lower()
Expand All @@ -113,8 +116,31 @@ def get_element_level_alignment(
cutoff=cutoff,
n=1,
)
matched_idx = ground_truth_td_contents_list.index(matches[0]) if matches else -1

# BUG FIX: the previous lookup (list.index) always returned the first matching index, even when the
# matched content appears more than once in ground_truth_td_contents_list. The fix below picks the
# first matching index that has not been used yet. Once every matching index has been used,
# used_indices is cleared (for all contents) and the search starts again from the first match.
# matched_idx = ground_truth_td_contents_list.index(matches[0]) if matches else -1
matching_indices = []
if matches != []:
b_indices = [
i
for i, b_string in enumerate(ground_truth_td_contents_list)
if b_string == matches[0] and i not in used_indices
]
if not b_indices:
# If all indices are used, reset used_indices and use the first index
used_indices.clear()
b_indices = [
i
for i, b_string in enumerate(ground_truth_td_contents_list)
if b_string == matches[0] and i not in used_indices
]
matching_index = b_indices[0]
matching_indices.append(matching_index)
used_indices.add(matching_index)
else:
matching_indices = [-1]
matched_idx = matching_indices[0]
if matched_idx >= 0:
gt_row_index = ground_truth_td[matched_idx]["row_index"]
gt_col_index = ground_truth_td[matched_idx]["col_index"]
Expand Down
Loading