From 7c786afe6902799ca39ffca6e94e254568c27165 Mon Sep 17 00:00:00 2001 From: sami bh Date: Fri, 22 Dec 2023 18:27:27 +0100 Subject: [PATCH] UP my solution --- sklearn_questions.py | 61 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index fa02e0d..cf9e575 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -49,6 +49,7 @@ """ import numpy as np import pandas as pd +from pandas.api.types import is_datetime64_any_dtype from sklearn.base import BaseEstimator from sklearn.base import ClassifierMixin @@ -82,6 +83,12 @@ def fit(self, X, y): self : instance of KNearestNeighbors The current instance of the classifier """ + X, y = check_X_y(X, y) + check_classification_targets(y) + self.n_features_in_ = X.shape[1] + self.classes_ = np.unique(y) + self.X_ = X + self.y_ = y return self def predict(self, X): @@ -97,8 +104,21 @@ def predict(self, X): y : ndarray, shape (n_test_samples,) Predicted class labels for each test data sample. """ - y_pred = np.zeros(X.shape[0]) - return y_pred + check_is_fitted(self) + X = check_array(X) + distances = pairwise_distances(X, self.X_, metric="euclidean") + closest = np.argsort(distances, axis=1)[:, : self.n_neighbors] + classes = self.y_[closest] + y_pred = [] + for row in range(classes.shape[0]): + _, idx, counts = np.unique( + classes[row], return_index=True, return_counts=True + ) + index = idx[np.argmax(counts)] + prediction = classes[row][index] + y_pred.append(prediction) + + return np.array(y_pred) def score(self, X, y): """Calculate the score of the prediction. @@ -115,7 +135,8 @@ def score(self, X, y): score : float Accuracy of the model computed for the (X, y) pairs. """ - return 0. + y_pred = self.predict(X) + return np.mean(y_pred == y) class MonthlySplit(BaseCrossValidator): @@ -134,7 +155,7 @@ class MonthlySplit(BaseCrossValidator): To use the index as column just set `time_col` to `'index'`. """ - def __init__(self, time_col='index'): # noqa: D107 + def __init__(self, time_col="index"): # noqa: D107 self.time_col = time_col def get_n_splits(self, X, y=None, groups=None): @@ -155,7 +176,21 @@ def get_n_splits(self, X, y=None, groups=None): n_splits : int The number of splits. """ - return 0 + _X = pd.DataFrame(X) + if "index" not in _X.columns: + _X = _X.reset_index(names="index") + if not is_datetime64_any_dtype(_X[self.time_col]): + raise ValueError( + f"Column {self.time_col} is not a datetime column." + ) + max_date = _X[self.time_col].max() + min_date = _X[self.time_col].min() + res = ( + 12 * (max_date.year - min_date.year) + + max_date.month + - min_date.month + ) + return res def split(self, X, y, groups=None): """Generate indices to split data into training and test set. @@ -178,11 +213,13 @@ def split(self, X, y, groups=None): The testing set indices for that split. """ - n_samples = X.shape[0] - n_splits = self.get_n_splits(X, y, groups) + X_copy = X.reset_index() + n_splits = self.get_n_splits(X_copy, y, groups) + X_grouped = X_copy.sort_values(by=self.time_col).groupby( + pd.Grouper(key=self.time_col, freq="M") + ) + idxs = [group.index for _, group in X_grouped] for i in range(n_splits): - idx_train = range(n_samples) - idx_test = range(n_samples) - yield ( - idx_train, idx_test - ) + idx_train = list(idxs[i]) + idx_test = list(idxs[i + 1]) + yield (idx_train, idx_test)