From c09f2456bfdad9ff4665095639b6a7929296c696 Mon Sep 17 00:00:00 2001 From: HaythamBorchani Date: Wed, 20 Dec 2023 23:14:59 +0100 Subject: [PATCH] UP my work --- sklearn_questions.py | 74 ++++++++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 20 deletions(-) diff --git a/sklearn_questions.py b/sklearn_questions.py index fa02e0d..c25c374 100644 --- a/sklearn_questions.py +++ b/sklearn_questions.py @@ -17,11 +17,8 @@ repo `pytest test_sklearn_questions.py`. Note that to be fully valid, a scikit-learn estimator needs to check that the input given to `fit` and `predict` are correct using the `check_*` functions imported in the file. -You can find more information on how they should be used in the following doc: -https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator. Make sure to use them to pass `test_nearest_neighbor_check_estimator`. - Detailed instructions for question 2: The data to split should contain the index or one column in datatime format. Then the aim is to split the data between train and test @@ -59,6 +56,7 @@ from sklearn.utils.validation import check_array from sklearn.utils.multiclass import check_classification_targets from sklearn.metrics.pairwise import pairwise_distances +from collections import Counter class KNearestNeighbors(BaseEstimator, ClassifierMixin): @@ -73,15 +71,22 @@ def fit(self, X, y): Parameters ---------- X : ndarray, shape (n_samples, n_features) - Data to train the model. + training data. y : ndarray, shape (n_samples,) - Labels associated with the training data. + target values. Returns ---------- self : instance of KNearestNeighbors The current instance of the classifier """ + X, y = check_X_y(X, y) + check_classification_targets(y) + self.X_ = X + self.labels_ = y + self.classes_ = np.unique(y) + self.n_features_in_ = X.shape[1] + return self def predict(self, X): @@ -90,14 +95,23 @@ def predict(self, X): Parameters ---------- X : ndarray, shape (n_test_samples, n_features) - Data to predict on. + Test data to predict on. Returns ---------- y : ndarray, shape (n_test_samples,) - Predicted class labels for each test data sample. + Class labels for each test data sample. """ - y_pred = np.zeros(X.shape[0]) + + check_is_fitted(self) + X = check_array(X) + distances = pairwise_distances(self.X_, X) + indices = np.argsort(distances, axis=0)[:self.n_neighbors, :] + k_labels = self.labels_[indices] + k_common = [Counter(k_labels[:, k]).most_common(1)[0][0] + for k in range(X.shape[0])] + + y_pred = np.array(k_common) return y_pred def score(self, X, y): @@ -106,7 +120,7 @@ def score(self, X, y): Parameters ---------- X : ndarray, shape (n_samples, n_features) - Data to score on. + training data. y : ndarray, shape (n_samples,) target values. @@ -115,16 +129,16 @@ def score(self, X, y): score : float Accuracy of the model computed for the (X, y) pairs. """ - return 0. + check_classification_targets(y) + y_pred = self.predict(X) + return np.mean(y_pred == y) class MonthlySplit(BaseCrossValidator): """CrossValidator based on monthly split. - Split data based on the given `time_col` (or default to index). Each split corresponds to one month of data for the training and the next month of data for the test. - Parameters ---------- time_col : str, defaults to 'index' @@ -139,7 +153,6 @@ def __init__(self, time_col='index'): # noqa: D107 def get_n_splits(self, X, y=None, groups=None): """Return the number of splitting iterations in the cross-validator. - Parameters ---------- X : array-like of shape (n_samples, n_features) @@ -149,17 +162,20 @@ def get_n_splits(self, X, y=None, groups=None): Always ignored, exists for compatibility. groups : array-like of shape (n_samples,) Always ignored, exists for compatibility. - Returns ------- n_splits : int The number of splits. """ - return 0 + if self.time_col != 'index': + time_datas = X[self.time_col].dt + else: + time_datas = X.index + + return time_datas.strftime('%Y-%m').nunique()-1 def split(self, X, y, groups=None): """Generate indices to split data into training and test set. - Parameters ---------- X : array-like of shape (n_samples, n_features) @@ -169,7 +185,6 @@ def split(self, X, y, groups=None): Always ignored, exists for compatibility. groups : array-like of shape (n_samples,) Always ignored, exists for compatibility. - Yields ------ idx_train : ndarray @@ -177,12 +192,31 @@ def split(self, X, y, groups=None): idx_test : ndarray The testing set indices for that split. """ + X = pd.DataFrame(X) + if self.time_col != 'index': + if X[self.time_col].dtype != '