Skip to content

Commit

Permalink
assignment sklearn
Browse files Browse the repository at this point in the history
  • Loading branch information
Lauriane Mousset committed Dec 22, 2023
1 parent 6ccb1be commit 45f8422
Showing 1 changed file with 38 additions and 11 deletions.
49 changes: 38 additions & 11 deletions sklearn_questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
to compute distances between 2 sets of samples.
"""
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
Expand All @@ -59,6 +58,8 @@
from sklearn.utils.validation import check_array
from sklearn.utils.multiclass import check_classification_targets
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import accuracy_score


class KNearestNeighbors(BaseEstimator, ClassifierMixin):
def fit(self, X, y):
    """Fit the classifier by memorising the training data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training samples; validated together with `y` by `check_X_y`.
    y : array-like, shape (n_samples,)
        Class label of each training sample; must be a valid
        classification target.

    Returns
    -------
    self : instance of KNearestNeighbors
        The current instance of the classifier
    """
    X_checked, y_checked = check_X_y(X, y)
    check_classification_targets(y_checked)

    # Nearest-neighbour "training" is just keeping the validated data around
    # so that predict can measure distances against it.
    self.X_, self.y_ = X_checked, y_checked
    self.classes_ = unique_labels(y_checked)
    self.n_features_in_ = X_checked.shape[1]
    return self

def predict(self, X):
    """Predict a class label for every sample in `X`.

    Each sample receives the most frequent label among its
    `n_neighbors` closest training samples. When several labels are
    equally frequent, the smallest one (in `np.unique` order) wins.

    Parameters
    ----------
    X : array-like, shape (n_test_samples, n_features)
        Samples to classify.

    Returns
    -------
    y : ndarray, shape (n_test_samples,)
        Predicted class labels for each test data sample.
    """
    # Fail early with NotFittedError before touching the input.
    check_is_fitted(self)
    X = check_array(X)

    distances = pairwise_distances(X, self.X_)
    # Column indices of the closest training samples, nearest first.
    nearest = np.argsort(distances, axis=1)[:, : self.n_neighbors]
    neighbor_labels = self.y_[nearest]

    y_pred = np.empty(neighbor_labels.shape[0], dtype=self.y_.dtype)
    for row, labels in enumerate(neighbor_labels):
        # One np.unique call per row yields both the candidates and votes.
        candidates, votes = np.unique(labels, return_counts=True)
        y_pred[row] = candidates[np.argmax(votes)]
    return y_pred

def score(self, X, y):
    """Return the accuracy of the classifier on the given data.

    Parameters
    ----------
    X : array-like
        Samples to classify; forwarded unchanged to `predict`.
    y : array-like
        True class label of each sample in `X`.

    Returns
    -------
    score : float
        Accuracy of the model computed for the (X, y) pairs.
    """
    y_pred = self.predict(X)
    # accuracy_score's signature is (y_true, y_pred).
    return accuracy_score(y, y_pred)


class MonthlySplit(BaseCrossValidator):
Expand All @@ -134,7 +153,7 @@ class MonthlySplit(BaseCrossValidator):
To use the index as the time column, set `time_col` to `'index'`.
"""

def __init__(self, time_col="index"):  # noqa: D107
    # Name of the datetime column used for splitting; the special value
    # "index" selects the index of the data instead of a column.
    self.time_col = time_col

def get_n_splits(self, X, y=None, groups=None):
    """Return the number of splitting iterations in the cross-validator.

    A split pairs one observed calendar month with the next observed
    one, so the result is the number of distinct (year, month) pairs
    found in the time data minus one, and never less than zero.

    Parameters
    ----------
    X : pandas DataFrame or Series
        Data holding the time information, either in the column named
        `time_col` or in its index when `time_col` is `'index'`.
    y : object
        Ignored; present for API compatibility.
    groups : object
        Ignored; present for API compatibility.

    Returns
    -------
    n_splits : int
        The number of splits.

    Raises
    ------
    ValueError
        If the selected time data does not have a datetime dtype.
    """
    # Imported locally so the method does not depend on the module's
    # import block providing pandas.
    from pandas import DatetimeIndex
    from pandas.api.types import is_datetime64_any_dtype

    # Read the index directly: going through reset_index() would break
    # as soon as the index carries a name other than "index".
    times = X.index if self.time_col == "index" else X[self.time_col]

    # Accept every datetime flavour (tz-aware, non-nanosecond units),
    # not only the exact "datetime64[ns]" dtype.
    if not is_datetime64_any_dtype(times.dtype):
        raise ValueError("datetime")

    stamps = DatetimeIndex(times)
    # year * 12 + month is a single integer identifying a calendar month.
    month_ids = np.asarray(stamps.year) * 12 + np.asarray(stamps.month)
    return max(np.unique(month_ids).size - 1, 0)

def split(self, X, y, groups=None):
    """Generate indices to split data into training and test set.

    Every fold trains on one observed calendar month and tests on the
    next observed one. Months without any sample are skipped, so the
    number of folds equals the number of distinct months minus one.

    Parameters
    ----------
    X : pandas DataFrame or Series
        Data holding the time information, either in the column named
        `time_col` or in its index when `time_col` is `'index'`.
    y : object
        Ignored; present for API compatibility.
    groups : object
        Ignored; present for API compatibility.

    Yields
    ------
    idx_train : ndarray
        The training set indices for that split.
    idx_test : ndarray
        The testing set indices for that split.

    Raises
    ------
    ValueError
        If the selected time data does not have a datetime dtype. As
        this is a generator, the error surfaces on the first iteration.
    """
    # Imported locally so the method does not depend on the module's
    # import block providing pandas.
    from pandas import DatetimeIndex
    from pandas.api.types import is_datetime64_any_dtype

    # Read the index directly: going through reset_index() would break
    # as soon as the index carries a name other than "index".
    times = X.index if self.time_col == "index" else X[self.time_col]
    if not is_datetime64_any_dtype(times.dtype):
        raise ValueError("datetime")

    stamps = DatetimeIndex(times)
    # year * 12 + month is a single integer identifying a calendar month.
    month_ids = np.asarray(stamps.year) * 12 + np.asarray(stamps.month)
    months = np.unique(month_ids)  # sorted, observed months only

    for train_month, test_month in zip(months[:-1], months[1:]):
        # Positional indices, in the original row order of X.
        idx_train = np.flatnonzero(month_ids == train_month)
        idx_test = np.flatnonzero(month_ids == test_month)
        yield (idx_train, idx_test)

0 comments on commit 45f8422

Please sign in to comment.