Skip to content

Commit

Permalink
Improve/ds gold docs tests (#110)
Browse files Browse the repository at this point in the history
* tests: add ds gold proba correction test

* tests: add label test

* add: docs & true_labels param in other fit functions
  • Loading branch information
shenxiangzhuang authored Jul 30, 2024
1 parent 089b81e commit c0c5b92
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 8 deletions.
49 changes: 41 additions & 8 deletions crowdkit/aggregation/classification/dawid_skene.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,17 @@ class DawidSkene(BaseClassificationAggregator):
>>> df, gt = load_dataset('relevance-2')
>>> ds = DawidSkene(100)
>>> result = ds.fit_predict(df)
If golden labels are available for some tasks, they can be passed as true labels to correct
the probability distributions of those tasks during the iterative process:
Examples:
>>> from crowdkit.aggregation import DawidSkene
>>> from crowdkit.datasets import load_dataset
>>> df, gt = load_dataset('relevance-2')
>>> true_labels = gt[:1000] # use the first 1000 true labels
>>> ds = DawidSkene(100)
>>> result = ds.fit_predict(df, true_labels)
"""

n_iter: int = attr.ib(default=100)
Expand Down Expand Up @@ -96,12 +107,13 @@ def _e_step(
Estimates the true task label probabilities using the specified workers' responses,
the prior label probabilities, and the workers' error probability matrix.
"""

# We have to multiply lots of probabilities and such products are known to converge
# to zero exponentially fast. To avoid floating-point precision problems we work with
# logs of original values
joined = data.join(np.log2(errors), on=["worker", "label"]) # type: ignore
joined.drop(columns=["worker", "label"], inplace=True)

priors.clip(lower=_EPS, inplace=True)
log_likelihoods = np.log2(priors) + joined.groupby("task", sort=False).sum()
log_likelihoods.rename_axis("label", axis=1, inplace=True)

Expand Down Expand Up @@ -135,26 +147,31 @@ def _evidence_lower_bound(
# escape boolean index/column names to prevent confusion between indexing by boolean array and iterable of names
joined = joined.rename(columns={True: "True", False: "False"}, copy=False)
priors = priors.rename(index={True: "True", False: "False"}, copy=False)
priors.clip(lower=_EPS, inplace=True)

joined.loc[:, priors.index] = joined.loc[:, priors.index].add(np.log(priors)) # type: ignore

joined.set_index(["task", "worker"], inplace=True)
joint_expectation = (
(probas.rename(columns={True: "True", False: "False"}) * joined).sum().sum()
)

probas.clip(lower=_EPS, inplace=True)
entropy = -(np.log(probas) * probas).sum().sum()
return float(joint_expectation + entropy)

def fit(
self, data: pd.DataFrame, true_labels: Optional["pd.Series[Any]"] = None
) -> "DawidSkene":
"""Fits the model to the training data with the EM algorithm.
Args:
data (DataFrame): The training dataset of workers' labeling results
which is represented as the `pandas.DataFrame` data containing `task`, `worker`, and `label` columns.
true_labels (Series): The ground truth labels of tasks.
The `pandas.Series` data is indexed by `task` so that `true_labels.loc[task]` is the task ground truth label.
When provided, the model will correct the probability distributions of task labels by the true labels
during the iterative process.
Returns:
DawidSkene: self.
"""
Expand Down Expand Up @@ -207,31 +224,47 @@ def fit(

return self

def fit_predict_proba(
    self, data: pd.DataFrame, true_labels: Optional["pd.Series[Any]"] = None
) -> pd.DataFrame:
    """Fits the model to the training data and returns probability distributions of labels for each task.

    Args:
        data (DataFrame): The training dataset of workers' labeling results
            which is represented as the `pandas.DataFrame` data containing `task`, `worker`, and `label` columns.
        true_labels (Series): The ground truth labels of tasks.
            The `pandas.Series` data is indexed by `task` so that `true_labels.loc[task]` is the task ground truth label.
            When provided, the model will correct the probability distributions of task labels by the true labels
            during the iterative process.

    Returns:
        DataFrame: Probability distributions of task labels.
            The `pandas.DataFrame` data is indexed by `task` so that `result.loc[task, label]` is the probability that the `task` true label is equal to `label`.
            Each probability is in the range from 0 to 1, all task probabilities must sum up to 1.
    """

    # Fit exactly once and forward the optional gold labels; fitting without
    # them first would only waste a full EM run whose result is overwritten.
    self.fit(data, true_labels)
    # Narrow the Optional attribute for type checkers; fit() is expected to set it.
    assert self.probas_ is not None, "no probas_"
    return self.probas_

def fit_predict(
    self, data: pd.DataFrame, true_labels: Optional["pd.Series[Any]"] = None
) -> "pd.Series[Any]":
    """Fits the model to the training data and returns the aggregated results.

    Args:
        data (DataFrame): The training dataset of workers' labeling results
            which is represented as the `pandas.DataFrame` data containing `task`, `worker`, and `label` columns.
        true_labels (Series): The ground truth labels of tasks.
            The `pandas.Series` data is indexed by `task` so that `true_labels.loc[task]` is the task ground truth label.
            When provided, the model will correct the probability distributions of task labels by the true labels
            during the iterative process.

    Returns:
        Series: Task labels. The `pandas.Series` data is indexed by `task` so that `labels.loc[task]` is the most likely true label of tasks.
    """

    # Fit exactly once and forward the optional gold labels; fitting without
    # them first would only waste a full EM run whose result is overwritten.
    self.fit(data, true_labels)
    # Narrow the Optional attribute for type checkers; fit() is expected to set it.
    assert self.labels_ is not None, "no labels_"
    return self.labels_

Expand Down
25 changes: 25 additions & 0 deletions tests/aggregation/test_ds_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,31 @@ def test_aggregate_ds_gold_on_toy_ysda(
)


@pytest.mark.parametrize("n_iter", [0, 1, 2])
def test_ds_gold_probas_correction_with_iters(
    n_iter: int,
    toy_answers_df: pd.DataFrame,
    toy_ground_truth_df: "pd.Series[Any]",
    toy_gold_df: "pd.Series[Any]",
) -> None:
    """Checks that gold tasks get probability 1.0 on their gold label for several iteration counts."""
    model = DawidSkene(n_iter).fit(toy_answers_df, toy_gold_df)
    fitted_probas = model.probas_
    assert fitted_probas is not None, "no probas_"

    # Attach each gold task's true label next to its probability row;
    # tasks without a gold label are dropped by the (inner) merge.
    gold_column = toy_gold_df.rename("true_label")
    gold_probas = fitted_probas.merge(gold_column, left_on="task", right_index=True)

    def _gold_label_has_full_mass(row: "pd.Series[Any]") -> "np.bool_":
        # Look up the probability stored under the row's own gold label.
        return np.isclose(row[row.true_label], 1.0, atol=1e-8)

    # Every gold task must put (numerically) all its mass on the gold label.
    match_count = gold_probas.apply(_gold_label_has_full_mass, axis=1).sum()
    assert match_count == len(toy_gold_df), f"{match_count=}, {len(toy_gold_df)=}"

    # The distributions of the gold tasks must still be normalised.
    row_totals = gold_probas.drop("true_label", axis=1).sum(axis=1)
    assert np.allclose(row_totals, 1.0, atol=1e-8)

    # The aggregated labels of the gold tasks must equal the gold labels.
    assert model.labels_ is not None, "no labels_"
    predicted_gold = model.labels_[toy_gold_df.index]
    assert_series_equal(predicted_gold, toy_gold_df, check_names=False)


@pytest.mark.parametrize("n_iter, tol", [(10, 0), (100500, 1e-5)])
def test_aggregate_ds_on_toy_ysda(
n_iter: int,
Expand Down

0 comments on commit c0c5b92

Please sign in to comment.