
Commit

UP my work
HaythamBorchani committed Dec 20, 2023
1 parent 6ccb1be commit c09f245
Showing 1 changed file with 54 additions and 20 deletions.
sklearn_questions.py (74 changes: 54 additions & 20 deletions)
@@ -17,11 +17,8 @@
repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a
scikit-learn estimator needs to check that the inputs given to `fit` and
`predict` are correct using the `check_*` functions imported in the file.
You can find more information on how they should be used in the following doc:
https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator.
Make sure to use them to pass `test_nearest_neighbor_check_estimator`.
Detailed instructions for question 2:
The data to split should have a datetime index or one column in
datetime format. Then the aim is to split the data between train and test
@@ -59,6 +56,7 @@
from sklearn.utils.validation import check_array
from sklearn.utils.multiclass import check_classification_targets
from sklearn.metrics.pairwise import pairwise_distances
from collections import Counter


class KNearestNeighbors(BaseEstimator, ClassifierMixin):
@@ -73,15 +71,22 @@ def fit(self, X, y):
Parameters
----------
X : ndarray, shape (n_samples, n_features)
Training data.
y : ndarray, shape (n_samples,)
Target values.
Returns
-------
self : instance of KNearestNeighbors
The current instance of the classifier.
"""
X, y = check_X_y(X, y)
check_classification_targets(y)
self.X_ = X
self.labels_ = y
self.classes_ = np.unique(y)
self.n_features_in_ = X.shape[1]

return self
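A quick sanity check of the `fit` step above on made-up toy data; the `n_neighbors=1` constructor argument is an assumption suggested by its use in `predict`, not something shown in this diff:

import numpy as np

knn = KNearestNeighbors(n_neighbors=1)  # assumed constructor argument
knn.fit(np.array([[0.0], [1.0], [2.0]]), np.array([0, 1, 1]))
print(knn.classes_, knn.n_features_in_)  # [0 1] 1

# check_X_y rejects inconsistent X / y shapes before anything is stored.
try:
    knn.fit(np.array([[0.0], [1.0]]), np.array([0, 1, 1]))
except ValueError as err:
    print(err)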

def predict(self, X):
@@ -90,14 +95,23 @@ def predict(self, X):
Parameters
----------
X : ndarray, shape (n_test_samples, n_features)
Test data to predict on.
Returns
-------
y : ndarray, shape (n_test_samples,)
Predicted class labels for each test sample.
"""
check_is_fitted(self)
X = check_array(X)
# Distance matrix of shape (n_train_samples, n_test_samples).
distances = pairwise_distances(self.X_, X)
# Row indices of the n_neighbors closest training samples per test sample.
indices = np.argsort(distances, axis=0)[:self.n_neighbors, :]
k_labels = self.labels_[indices]
# Majority vote among the k nearest neighbours of each test sample.
k_common = [Counter(k_labels[:, j]).most_common(1)[0][0]
for j in range(X.shape[0])]

y_pred = np.array(k_common)
return y_pred
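The argsort/Counter majority vote used in `predict` can be illustrated in isolation; this standalone sketch applies the same pattern to made-up one-dimensional data with k = 3:

import numpy as np
from collections import Counter

train = np.array([[0.0], [1.0], [4.0], [5.0]])
labels = np.array(['a', 'a', 'b', 'b'])
query = np.array([[0.5], [4.5]])

# distances[i, j] = |train[i] - query[j]|, shape (n_train, n_query).
distances = np.abs(train - query.T)
# Row indices of the 3 closest training points for each query column.
nearest = np.argsort(distances, axis=0)[:3, :]
votes = labels[nearest]
pred = [Counter(votes[:, j]).most_common(1)[0][0]
        for j in range(query.shape[0])]
print(np.array(pred))  # first query -> 'a', second query -> 'b'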

def score(self, X, y):
@@ -106,7 +120,7 @@ def score(self, X, y):
Parameters
----------
X : ndarray, shape (n_samples, n_features)
Data to score on.
y : ndarray, shape (n_samples,)
Target values.
@@ -115,16 +129,16 @@
score : float
Accuracy of the model computed for the (X, y) pairs.
"""
check_classification_targets(y)
y_pred = self.predict(X)
return np.mean(y_pred == y)
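An end-to-end usage sketch of the classifier defined above, again assuming an `n_neighbors` constructor argument and using made-up clusters of points:

import numpy as np

rng = np.random.RandomState(0)
X_train = np.vstack([rng.randn(20, 2), rng.randn(20, 2) + 5.0])
y_train = np.array([0] * 20 + [1] * 20)
X_test = np.array([[0.0, 0.0], [5.0, 5.0]])

knn = KNearestNeighbors(n_neighbors=5)  # assumed constructor argument
knn.fit(X_train, y_train)
print(knn.predict(X_test))          # expected: [0 1]
print(knn.score(X_train, y_train))  # training accuracy, typically close to 1.0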


class MonthlySplit(BaseCrossValidator):
"""CrossValidator based on monthly split.
Split data based on the given `time_col` (or the index by default). Each
split corresponds to one month of data for the training set and the next
month of data for the test set.
Parameters
----------
time_col : str, defaults to 'index'
@@ -139,7 +153,6 @@ def __init__(self, time_col='index'): # noqa: D107

def get_n_splits(self, X, y=None, groups=None):
"""Return the number of splitting iterations in the cross-validator.
Parameters
----------
X : array-like of shape (n_samples, n_features)
@@ -149,17 +162,20 @@ def get_n_splits(self, X, y=None, groups=None):
Always ignored, exists for compatibility.
groups : array-like of shape (n_samples,)
Always ignored, exists for compatibility.
Returns
-------
n_splits : int
The number of splits.
"""
# One split per pair of consecutive months present in the data.
if self.time_col != 'index':
time_datas = X[self.time_col].dt
else:
time_datas = X.index

return time_datas.strftime('%Y-%m').nunique()-1
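A small sketch of what `get_n_splits` computes, on a made-up DataFrame with a datetime index:

import numpy as np
import pandas as pd

dates = pd.date_range('2021-01-15', '2021-04-15', freq='D')
X = pd.DataFrame({'value': np.arange(len(dates))}, index=dates)

cv = MonthlySplit()  # time_col='index' by default
print(cv.get_n_splits(X))  # 4 distinct months -> 3 train/test pairs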

def split(self, X, y, groups=None):
"""Generate indices to split data into training and test set.
Parameters
----------
X : array-like of shape (n_samples, n_features)
@@ -169,20 +185,38 @@ def split(self, X, y, groups=None):
Always ignored, exists for compatibility.
groups : array-like of shape (n_samples,)
Always ignored, exists for compatibility.
Yields
------
idx_train : ndarray
The training set indices for that split.
idx_test : ndarray
The testing set indices for that split.
"""
X = pd.DataFrame(X)
if self.time_col != 'index':
if not pd.api.types.is_datetime64_any_dtype(X[self.time_col]):
raise ValueError('time_col must contain datetime values.')
else:
if not pd.api.types.is_datetime64_any_dtype(X.index):
raise ValueError('The index must contain datetime values.')

n_splits = self.get_n_splits(X, y, groups)

if self.time_col != 'index':
time_datas = X[self.time_col].dt
else:
time_datas = X.index

months = np.sort(np.unique(time_datas.strftime('%Y-%m')))
# Remember each row's original position before re-ordering by time.
X['indices'] = np.arange(len(X))
if self.time_col != 'index':
X_bis = X.reset_index().set_index(self.time_col).sort_index()
else:
X_bis = X.sort_index()

for i in range(n_splits):
# Train on month i, test on the following month.
idx_train = X_bis[months[i]:months[i]]['indices'].values
idx_test = X_bis[months[i + 1]:months[i + 1]]['indices'].values
yield (
idx_train, idx_test
)
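Finally, a usage sketch of `MonthlySplit` on made-up daily data, both iterating over the splits directly and passing the splitter to `cross_val_score`:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

dates = pd.date_range('2021-01-01', '2021-03-31', freq='D')
X = pd.DataFrame({'value': np.random.RandomState(0).randn(len(dates))},
                 index=dates)
y = X['value'].shift(-1).fillna(0.0)

cv = MonthlySplit()
for idx_train, idx_test in cv.split(X, y):
    print(X.index[idx_train].min(), '->', X.index[idx_test].max())

scores = cross_val_score(LinearRegression(), X, y, cv=cv)
print(len(scores))  # one score per monthly split (2 here)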
