x-datascience-datacamp · SebX-7879 · Dec 21, 2023
diff --git a/sklearn_questions.py b/sklearn_questions.py
@@ -49,6 +49,7 @@
 """
 import numpy as np
 import pandas as pd
+from collections import Counter
 
 from sklearn.base import BaseEstimator
 from sklearn.base import ClassifierMixin
@@ -61,6 +62,20 @@
 from sklearn.metrics.pairwise import pairwise_distances
 
 
+def most_common_label(array):
+    """_summary_
+    Args:
+        array : the array of labels
+    Returns:
+        most_common : the most common label in the array
+    """
+    counter = Counter(array)
+    most_common = counter.most_common(1)
+    # If there is a tie for the most common label, this will return the first \
+    # one encountered.
+    return most_common[0][0] if most_common else None
+
+
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):
     """KNearestNeighbors classifier."""
 
@@ -82,6 +97,12 @@ def fit(self, X, y):
         self : instance of KNearestNeighbors
             The current instance of the classifier
         """
+        X, y = check_X_y(X, y)
+        check_classification_targets(y)
+        self.X_ = X
+        self.y_ = y
+        self.n_features_in_ = X.shape[1]
+        self.classes_ = np.unique(y)
         return self
 
     def predict(self, X):
@@ -97,7 +118,12 @@ def predict(self, X):
         y : ndarray, shape (n_test_samples,)
             Predicted class labels for each test data sample.
         """
-        y_pred = np.zeros(X.shape[0])
+        check_is_fitted(self)
+        X = check_array(X)
+        closest = np.argsort(pairwise_distances(X, self.X_))[:,
+                                                             :self.n_neighbors]
+        y_pred = np.array([most_common_label(self.y_[closest[i]]) for i in
+                           range(X.shape[0])])
         return y_pred
 
     def score(self, X, y):
@@ -115,7 +141,10 @@ def score(self, X, y):
         score : float
             Accuracy of the model computed for the (X, y) pairs.
         """
-        return 0.
+        check_classification_targets(y)
+        y_pred = self.predict(X)
+        score = (y == y_pred).mean()
+        return score
 
 
 class MonthlySplit(BaseCrossValidator):
@@ -155,7 +184,14 @@ def get_n_splits(self, X, y=None, groups=None):
         n_splits : int
             The number of splits.
         """
-        return 0
+        if self.time_col == 'index':
+            unique_year_month = set([(date.year, date.month)
+                                     for date in X.index])
+        else:
+            unique_year_month = set([(date.year, date.month)
+                                     for date in X[self.time_col]])
+        n_splits = len(unique_year_month)-1
+        return n_splits
 
     def split(self, X, y, groups=None):
         """Generate indices to split data into training and test set.
@@ -177,12 +213,50 @@ def split(self, X, y, groups=None):
         idx_test : ndarray
             The testing set indices for that split.
         """
-
-        n_samples = X.shape[0]
+        if self.time_col == 'index':
+            if not isinstance(X.index, pd.DatetimeIndex):
+                raise ValueError("Index of the DataFrame is not datetime")
+            df = X.index.map(lambda date: (date.year, date.month))
+        else:
+            if not isinstance(X[self.time_col], pd.Series) or \
+                    X[self.time_col].dtype != 'datetime64[ns]':
+                raise ValueError("The splitting column is not a datetime")
+            df = X[self.time_col].apply(lambda date: (date.year, date.month))
+        unique_year_month = sorted(set(df))
         n_splits = self.get_n_splits(X, y, groups)
+        assert len(unique_year_month) == n_splits+1, "The number \
+        of unique year-month pairs must be equal to the n_split+1"
         for i in range(n_splits):
-            idx_train = range(n_samples)
-            idx_test = range(n_samples)
+            idx_train = [X.index.get_loc(elm)
+                         for elm in X[df == unique_year_month[i]]
+                         .index.tolist()]
+            idx_test = [X.index.get_loc(elm)
+                        for elm in X[df == unique_year_month[i+1]]
+                        .index.tolist()]
             yield (
                 idx_train, idx_test
             )
+        df = None
+
+
+def main():
+    # Create a DataFrame with a datetime column
+    date_range = pd.date_range(start='1/1/2020', end='1/1/2022', freq='D')
+    df = pd.DataFrame({
+        'value': np.random.rand(len(date_range))
+    }, index=date_range)
+    y = pd.DataFrame({
+        'label': np.random.randint(1, 4, df.shape[0])
+    }, index=date_range)
+    spliter = MonthlySplit()
+    # Use your generator function to get the training and test indices
+    # for each split
+    for i, (idx_train, idx_test) in enumerate(spliter.split(df, y)):
+        print(f"Training indices: {df.iloc[idx_train]}")
+        print(f"Test indices: {df.iloc[idx_test]}")
+        if i == 4:
+            break
+
+
+if __name__ == '__main__':
+    main()