From ac16a648aaad3a339bdd38bb4ad93d72bf741bdd Mon Sep 17 00:00:00 2001 From: Aymane-Rahmoune Date: Wed, 20 Dec 2023 20:34:44 +0100 Subject: [PATCH] UP my solution --- sklearn_questions.py | 72 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 9 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index fa02e0d..74bde44 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -49,9 +49,11 @@ """ import numpy as np import pandas as pd +# import datetime as date from sklearn.base import BaseEstimator from sklearn.base import ClassifierMixin +from sklearn.metrics import accuracy_score from sklearn.model_selection import BaseCrossValidator @@ -82,6 +84,13 @@ def fit(self, X, y): self : instance of KNearestNeighbors The current instance of the classifier """ + X, y = check_X_y(X, y) + check_classification_targets(y) + self.classes_ = np.unique(y) + self.X_ = X + self.y_ = y + self.n_features_in_ = X.shape[1] + return self def predict(self, X): @@ -97,7 +106,25 @@ def predict(self, X): y : ndarray, shape (n_test_samples,) Predicted class labels for each test data sample. """ - y_pred = np.zeros(X.shape[0]) + check_is_fitted(self) + X = check_array(X) + + # calcul distances + distances = pairwise_distances(X, self.X_) + + # select k points les plus proches + closest = np.argsort(distances, axis=1)[:, : self.n_neighbors] + closest = self.y_[closest] + + # choix du y le plus frequent + y_pred = np.apply_along_axis( + lambda x: np.unique(x, return_counts=True)[0][ + np.argmax(np.unique(x, return_counts=True)[1]) + ], + axis=1, + arr=closest, + ) + return y_pred def score(self, X, y): @@ -115,7 +142,9 @@ def score(self, X, y): score : float Accuracy of the model computed for the (X, y) pairs. """ - return 0. + predictions = self.predict(X) + + return accuracy_score(y, predictions) class MonthlySplit(BaseCrossValidator): @@ -155,7 +184,15 @@ def get_n_splits(self, X, y=None, groups=None): n_splits : int The number of splits. """ - return 0 + X = X.reset_index() + + if X[self.time_col].dtype != "datetime64[ns]": + raise ValueError("datetime") + column_date = X[self.time_col] + max = column_date.max() + min = column_date.min() + + return (max.year - min.year) * 12 + max.month - min.month def split(self, X, y, groups=None): """Generate indices to split data into training and test set. @@ -178,11 +215,28 @@ def split(self, X, y, groups=None): The testing set indices for that split. """ - n_samples = X.shape[0] n_splits = self.get_n_splits(X, y, groups) - for i in range(n_splits): - idx_train = range(n_samples) - idx_test = range(n_samples) - yield ( - idx_train, idx_test + + X = X.reset_index() + X.index.names = ["Index_nb"] + X = X.reset_index() + X["Month"] = pd.DatetimeIndex(X[self.time_col]).month + X["Year"] = pd.DatetimeIndex(X[self.time_col]).year + X_ = X.copy() + X_ = X_[["Month", "Year"]].drop_duplicates().sort_values( + ["Year", "Month"] ) + for i in range(n_splits): + idx_train = X.merge( + X_.iloc[[i]], + how="inner", + left_on=["Month", "Year"], + right_on=["Month", "Year"], + )["Index_nb"].to_numpy() + idx_test = X.merge( + X_.iloc[[i + 1]], + how="inner", + left_on=["Month", "Year"], + right_on=["Month", "Year"], + )["Index_nb"].to_numpy() + yield (idx_train, idx_test)