Third assignment

x-datascience-datacamp · Dec 21, 2023 · 37f4df5 · 37f4df5
1 parent 6ccb1be
commit 37f4df5
Show file tree

Hide file tree

Showing 2 changed files with 56 additions and 11 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "python.testing.pytestArgs": [
+        "."
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true
+}
diff --git a/sklearn_questions.py b/sklearn_questions.py
@@ -59,6 +59,7 @@
 from sklearn.utils.validation import check_array
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.metrics.pairwise import pairwise_distances
+from sklearn.utils.multiclass import unique_labels
 
 
 class KNearestNeighbors(BaseEstimator, ClassifierMixin):
@@ -82,6 +83,21 @@ def fit(self, X, y):
         self : instance of KNearestNeighbors
             The current instance of the classifier
         """
+        X, y = check_X_y(X, y)
+        check_classification_targets(y)
+        self.n_features_in_ = X.shape[1]
+        self.classes_ = unique_labels(y)
+        if len(self.classes_) < 2:
+            raise ValueError("You need more than one class.")
+        self.X_ = X
+        self.y_ = y
+
+        if self.n_neighbors > len(self.X_):
+            raise ValueError(
+                "Number of neighbors cannot be greater \
+                the number of training samples."
+                )
+
         return self
 
     def predict(self, X):
@@ -97,7 +113,15 @@ def predict(self, X):
         y : ndarray, shape (n_test_samples,)
             Predicted class labels for each test data sample.
         """
-        y_pred = np.zeros(X.shape[0])
+        check_is_fitted(self)
+        X = check_array(X)
+        y_pred = []
+        for i, x in enumerate(X):
+            distances = pairwise_distances(x.reshape(1, -1), self.X_)
+            idx = np.argsort(distances, axis=1)[0][:self.n_neighbors]
+            values, counts = np.unique(self.y_[idx], return_counts=True)
+            y_pred.append(values[np.argmax(counts)])
+        y_pred = np.array(y_pred)
         return y_pred
 
     def score(self, X, y):
@@ -115,7 +139,10 @@ def score(self, X, y):
         score : float
             Accuracy of the model computed for the (X, y) pairs.
         """
-        return 0.
+        check_is_fitted(self)
+        X = check_array(X)
+        y_pred = self.predict(X)
+        return np.mean(y == y_pred)
 
 
 class MonthlySplit(BaseCrossValidator):
@@ -155,7 +182,15 @@ def get_n_splits(self, X, y=None, groups=None):
         n_splits : int
             The number of splits.
         """
-        return 0
+        X_copy = X.reset_index() if self.time_col == 'index' else X.copy()
+        if X_copy[self.time_col].dtype != 'datetime64[ns]':
+            raise ValueError(
+                f"The column '{self.time_col}' is not a datetime."
+                )
+        X_copy.sort_values(by=self.time_col, inplace=True)
+        month_changes = X_copy[self.time_col].dt.month.diff().ne(0)
+        n_splits = month_changes.sum() - 1
+        return n_splits
 
     def split(self, X, y, groups=None):
         """Generate indices to split data into training and test set.
@@ -177,12 +212,15 @@ def split(self, X, y, groups=None):
         idx_test : ndarray
             The testing set indices for that split.
         """
-
-        n_samples = X.shape[0]
-        n_splits = self.get_n_splits(X, y, groups)
+        X_copy = X.reset_index()
+        n_splits = self.get_n_splits(X_copy, y, groups)
+        X_grouped = X_copy.sort_values(
+            by=self.time_col
+            ).groupby(
+                pd.Grouper(key=self.time_col, freq="M")
+                )
+        idxs = [group.index for _, group in X_grouped]
         for i in range(n_splits):
-            idx_train = range(n_samples)
-            idx_test = range(n_samples)
-            yield (
-                idx_train, idx_test
-            )
+            idx_train = idxs[i].tolist()
+            idx_test = idxs[i+1].tolist()
+            yield idx_train, idx_test