x-datascience-datacamp · matthieu-bricaire · Dec 19, 2023
diff --git a/sklearn_questions.py b/sklearn_questions.py
@@ -48,7 +48,6 @@
 to compute distances between 2 sets of samples.
 """
 import numpy as np
-import pandas as pd
 
 from sklearn.base import BaseEstimator
 from sklearn.base import ClassifierMixin
@@ -57,9 +56,12 @@
 
 from sklearn.utils.validation import check_X_y, check_is_fitted
 from sklearn.utils.validation import check_array
-from sklearn.utils.multiclass import check_classification_targets
+from sklearn.utils.multiclass import (check_classification_targets,
+                                      unique_labels)
 from sklearn.metrics.pairwise import pairwise_distances
 
+from collections import Counter
+
 
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):
     """KNearestNeighbors classifier."""
@@ -82,6 +84,21 @@ def fit(self, X, y):
         self : instance of KNearestNeighbors
             The current instance of the classifier
         """
+        # Validate X and y together (consistent lengths, 2D numeric X),
+        # then make sure y holds class labels rather than continuous values.
+        X, y = check_X_y(X, y)
+        check_classification_targets(y=y)
+
+        # Keep the whole training set: prediction searches it for neighbors.
+        self.X_train_ = X
+        self.y_train_ = y
+        self.classes_ = unique_labels(y)
+
+        # X is 2D after check_X_y, so its shape unpacks into exactly
+        # (n_samples, n_features).
+        self.n_samples_fit_, self.n_features_in_ = X.shape
+        self._is_fitted = True
+
         return self
 
     def predict(self, X):
@@ -97,8 +114,15 @@ def predict(self, X):
         y : ndarray, shape (n_test_samples,)
             Predicted class labels for each test data sample.
         """
-        y_pred = np.zeros(X.shape[0])
-        return y_pred
+        check_is_fitted(self)
+        X = check_array(X)
+        # Distance from each test sample (row) to each training sample.
+        dists = pairwise_distances(X=X, Y=self.X_train_)
+        min_indices = np.argsort(dists, axis=1)[:, :self.n_neighbors]
+        preds = self.y_train_[min_indices]
+        # Majority vote per row; ties go to the label encountered first.
+        y_pred = [Counter(row).most_common(1)[0][0] for row in preds]
+        return np.array(y_pred)
 
     def score(self, X, y):
         """Calculate the score of the prediction.
@@ -115,7 +139,13 @@ def score(self, X, y):
         score : float
             Accuracy of the model computed for the (X, y) pairs.
         """
-        return 0.
+        check_is_fitted(self)
+        X = check_array(X)
+        check_classification_targets(y=y)
+        # Accuracy = number of correct predictions / number of predictions.
+        y_pred = self.predict(X)
+        n_correct = (y_pred == y).sum()
+        return n_correct / y_pred.shape[0]
 
 
 class MonthlySplit(BaseCrossValidator):
@@ -155,7 +185,19 @@ def get_n_splits(self, X, y=None, groups=None):
         n_splits : int
             The number of splits.
         """
-        return 0
+        X = X.reset_index()
+        dates = X[self.time_col]
+        # Non-datetime values have no .year/.month -> AttributeError below.
+        try:
+            year_months = dates.map(lambda x: (x.year, x.month))
+        except AttributeError as e:
+            raise ValueError(
+                f"time_col column should be of type datetime - {e}"
+            ) from e
+        # One split per pair of consecutive distinct (year, month) values.
+        n_months = len(year_months.unique())
+
+        return n_months - 1
 
     def split(self, X, y, groups=None):
         """Generate indices to split data into training and test set.
@@ -177,12 +219,22 @@ def split(self, X, y, groups=None):
         idx_test : ndarray
             The testing set indices for that split.
         """
-
-        n_samples = X.shape[0]
         n_splits = self.get_n_splits(X, y, groups)
+        X = X.reset_index()
+        dates = X[self.time_col]
+        year_months = dates.map(lambda x: (x.year, x.month))
+        months = np.sort(year_months.unique())
+        # Split i trains on the i-th sorted month and tests on the next one.
         for i in range(n_splits):
-            idx_train = range(n_samples)
-            idx_test = range(n_samples)
+            year, month = months[i]
+            idx_train = X[
+                (dates.dt.year == year) & (dates.dt.month == month)
+                ].index.to_numpy()
+            year, month = months[i + 1]
+            idx_test = X[
+                (dates.dt.year == year) & (dates.dt.month == month)
+                ].index.to_numpy()
+            # After reset_index the index labels are the row positions.
             yield (
                 idx_train, idx_test
             )