Improve/ds gold docs tests (#110)

* tests: add ds gold proba correction test * tests: add label test * add: docs & true_labels param in other fit functions
Toloka · Jul 30, 2024 · c0c5b92 · c0c5b92
1 parent 089b81e
commit c0c5b92
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 8 deletions.
diff --git a/crowdkit/aggregation/classification/dawid_skene.py b/crowdkit/aggregation/classification/dawid_skene.py
@@ -44,6 +44,17 @@ class DawidSkene(BaseClassificationAggregator):
         >>> df, gt = load_dataset('relevance-2')
         >>> ds = DawidSkene(100)
         >>> result = ds.fit_predict(df)
+
+    When golden (ground-truth) labels are known for some tasks, pass them as `true_labels` to correct
+    the probability distributions of those tasks' labels during the iterative process:
+
+    Examples:
+        >>> from crowdkit.aggregation import DawidSkene
+        >>> from crowdkit.datasets import load_dataset
+        >>> df, gt = load_dataset('relevance-2')
+        >>> true_labels = gt[:1000]  # use the first 1000 true labels
+        >>> ds = DawidSkene(100)
+        >>> result = ds.fit_predict(df, true_labels)
     """
 
     n_iter: int = attr.ib(default=100)
@@ -96,12 +107,13 @@ def _e_step(
         Estimates the true task label probabilities using the specified workers' responses,
         the prior label probabilities, and the workers' error probability matrix.
         """
-
         # We have to multiply lots of probabilities and such products are known to converge
         # to zero exponentially fast. To avoid floating-point precision problems we work with
         # logs of original values
         joined = data.join(np.log2(errors), on=["worker", "label"])  # type: ignore
         joined.drop(columns=["worker", "label"], inplace=True)
+
+        priors.clip(lower=_EPS, inplace=True)
         log_likelihoods = np.log2(priors) + joined.groupby("task", sort=False).sum()
         log_likelihoods.rename_axis("label", axis=1, inplace=True)
 
@@ -135,26 +147,31 @@ def _evidence_lower_bound(
         # escape boolean index/column names to prevent confusion between indexing by boolean array and iterable of names
         joined = joined.rename(columns={True: "True", False: "False"}, copy=False)
         priors = priors.rename(index={True: "True", False: "False"}, copy=False)
+        priors.clip(lower=_EPS, inplace=True)
 
         joined.loc[:, priors.index] = joined.loc[:, priors.index].add(np.log(priors))  # type: ignore
 
         joined.set_index(["task", "worker"], inplace=True)
         joint_expectation = (
             (probas.rename(columns={True: "True", False: "False"}) * joined).sum().sum()
         )
-
+        probas.clip(lower=_EPS, inplace=True)
         entropy = -(np.log(probas) * probas).sum().sum()
         return float(joint_expectation + entropy)
 
     def fit(
         self, data: pd.DataFrame, true_labels: Optional["pd.Series[Any]"] = None
     ) -> "DawidSkene":
         """Fits the model to the training data with the EM algorithm.
+
         Args:
             data (DataFrame): The training dataset of workers' labeling results
                 which is represented as the `pandas.DataFrame` data containing `task`, `worker`, and `label` columns.
-            true_labels (Series): The ground truth labels of tasks. The `pandas.Series` data is indexed by `task`
-                        so that `labels.loc[task]` is the task ground truth label.
+            true_labels (Series): The ground truth labels of tasks.
+                The `pandas.Series` data is indexed by `task` so that `true_labels.loc[task]` is the task ground truth label.
+                When provided, the model will correct the probability distributions of task labels by the true labels
+                during the iterative process.
+
         Returns:
             DawidSkene: self.
         """
@@ -207,31 +224,47 @@ def fit(
 
         return self
 
-    def fit_predict_proba(self, data: pd.DataFrame) -> pd.DataFrame:
+    def fit_predict_proba(
+        self, data: pd.DataFrame, true_labels: Optional["pd.Series[Any]"] = None
+    ) -> pd.DataFrame:
         """Fits the model to the training data and returns probability distributions of labels for each task.
+
         Args:
             data (DataFrame): The training dataset of workers' labeling results
                 which is represented as the `pandas.DataFrame` data containing `task`, `worker`, and `label` columns.
+            true_labels (Series): The ground truth labels of tasks.
+                The `pandas.Series` data is indexed by `task` so that `true_labels.loc[task]` is the task ground truth label.
+                When provided, the model will correct the probability distributions of task labels by the true labels
+                during the iterative process.
+
         Returns:
             DataFrame: Probability distributions of task labels.
                 The `pandas.DataFrame` data is indexed by `task` so that `result.loc[task, label]` is the probability that the `task` true label is equal to `label`.
                 Each probability is in the range from 0 to 1, all task probabilities must sum up to 1.
         """
 
-        self.fit(data)
+        self.fit(data, true_labels)
         assert self.probas_ is not None, "no probas_"
         return self.probas_
 
-    def fit_predict(self, data: pd.DataFrame) -> "pd.Series[Any]":
+    def fit_predict(
+        self, data: pd.DataFrame, true_labels: Optional["pd.Series[Any]"] = None
+    ) -> "pd.Series[Any]":
         """Fits the model to the training data and returns the aggregated results.
+
         Args:
             data (DataFrame): The training dataset of workers' labeling results
                 which is represented as the `pandas.DataFrame` data containing `task`, `worker`, and `label` columns.
+            true_labels (Series): The ground truth labels of tasks.
+                The `pandas.Series` data is indexed by `task` so that `true_labels.loc[task]` is the task ground truth label.
+                When provided, the model will correct the probability distributions of task labels by the true labels
+                during the iterative process.
+
         Returns:
             Series: Task labels. The `pandas.Series` data is indexed by `task` so that `labels.loc[task]` is the most likely true label of tasks.
         """
 
-        self.fit(data)
+        self.fit(data, true_labels)
         assert self.labels_ is not None, "no labels_"
         return self.labels_
 

diff --git a/tests/aggregation/test_ds_aggregation.py b/tests/aggregation/test_ds_aggregation.py
@@ -28,6 +28,31 @@ def test_aggregate_ds_gold_on_toy_ysda(
     )
 
 
+@pytest.mark.parametrize("n_iter", [0, 1, 2])
+def test_ds_gold_probas_correction_with_iters(
+    n_iter: int,
+    toy_answers_df: pd.DataFrame,
+    toy_ground_truth_df: "pd.Series[Any]",
+    toy_gold_df: "pd.Series[Any]",
+) -> None:
+    ds = DawidSkene(n_iter).fit(toy_answers_df, toy_gold_df)
+    probas = ds.probas_
+    assert probas is not None, "no probas_"
+    probas = probas.merge(
+        toy_gold_df.rename("true_label"), left_on="task", right_index=True
+    )
+    # check that gold label probas are correct, i.e. equal to 1.0
+    match_count = probas.apply(
+        lambda row: np.isclose(row[row.true_label], 1.0, atol=1e-8), axis=1
+    ).sum()
+    assert match_count == len(toy_gold_df), f"{match_count=}, {len(toy_gold_df)=}"
+    # check that each row of probas sums to 1 (i.e. is a valid probability distribution)
+    assert np.allclose(probas.drop("true_label", axis=1).sum(axis=1), 1.0, atol=1e-8)
+    # check labels
+    assert ds.labels_ is not None, "no labels_"
+    assert_series_equal(ds.labels_[toy_gold_df.index], toy_gold_df, check_names=False)
+
+
 @pytest.mark.parametrize("n_iter, tol", [(10, 0), (100500, 1e-5)])
 def test_aggregate_ds_on_toy_ysda(
     n_iter: int,