Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Premier_commit #156

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 45 additions & 23 deletions sklearn_questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,40 +48,44 @@
to compute distances between 2 sets of samples.
"""
import numpy as np
import pandas as pd

from sklearn.utils.multiclass import unique_labels
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin

from sklearn.model_selection import BaseCrossValidator

from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.utils.validation import check_array
from sklearn.utils.multiclass import check_classification_targets
from sklearn.metrics.pairwise import pairwise_distances
from statistics import mode


class KNearestNeighbors(BaseEstimator, ClassifierMixin):
"""KNearestNeighbors classifier."""

def __init__(self, n_neighbors=1): # noqa: D107
"""Init."""
self.n_neighbors = n_neighbors

def fit(self, X, y):
"""Fitting function.
"""Predict function.

Parameters

Parameters
----------
X : ndarray, shape (n_samples, n_features)
Data to train the model.
y : ndarray, shape (n_samples,)
Labels associated with the training data.
X : ndarray, shape (n_test_samples, n_features)
Data to predict on.

Returns
-------
self : instance of KNearestNeighbors
The current instance of the classifier
y : ndarray, shape (n_test_samples,)
Predicted class labels for each test data sample.
"""
check_classification_targets(y)
X, y = check_X_y(X, y)
self.classes_ = unique_labels(y)
self.n_features_in_ = X.shape[1]
self.X_ = X
self.y_ = y
return self

def predict(self, X):
Expand All @@ -97,7 +101,13 @@ def predict(self, X):
y : ndarray, shape (n_test_samples,)
Predicted class labels for each test data sample.
"""
y_pred = np.zeros(X.shape[0])
check_is_fitted(self)
check_array(X)
distances = pairwise_distances(X, self.X_, metric='euclidean')
ordered_index = np.argsort(distances, axis=1)
k_neig = self.y_[ordered_index[:, :self.n_neighbors]]
y_pred = np.array([mode(row) for row in k_neig])

return y_pred

def score(self, X, y):
Expand All @@ -109,13 +119,18 @@ def score(self, X, y):
Data to score on.
y : ndarray, shape (n_samples,)
target values.

Returns
-------
score : float
Accuracy of the model computed for the (X, y) pairs.
"""
return 0.
check_is_fitted(self)
check_array(X)
check_classification_targets(y)
yhat = self.predict(X)
score = float(np.where(yhat == y, 1, 0).mean())

return score


class MonthlySplit(BaseCrossValidator):
Expand All @@ -134,7 +149,7 @@ class MonthlySplit(BaseCrossValidator):
To use the index as column just set `time_col` to `'index'`.
"""

def __init__(self, time_col='index'): # noqa: D107
def __init__(self, time_col="index"): # noqa: D107
self.time_col = time_col

def get_n_splits(self, X, y=None, groups=None):
Expand All @@ -155,7 +170,11 @@ def get_n_splits(self, X, y=None, groups=None):
n_splits : int
The number of splits.
"""
return 0
X = X.reset_index()
if X.loc[:, self.time_col].dtype != "datetime64[ns]":
raise ValueError(f"{self.time_col} should be of type datetime")

return np.unique(X[self.time_col].dt.strftime("%Y-%m")).shape[0] - 1

def split(self, X, y, groups=None):
"""Generate indices to split data into training and test set.
Expand All @@ -177,12 +196,15 @@ def split(self, X, y, groups=None):
idx_test : ndarray
The testing set indices for that split.
"""
# Reset the index so that it becomes a regular column
X = X.reset_index()

n_samples = X.shape[0]
# Generate the splits
n_splits = self.get_n_splits(X, y, groups)
month = X[self.time_col].dt.strftime("%Y-%m")
list_months = np.sort(np.unique(month))
for i in range(n_splits):
idx_train = range(n_samples)
idx_test = range(n_samples)
yield (
idx_train, idx_test
)
idx_train = np.where(month == list_months[i])[0]
idx_test = np.where(month == list_months[i + 1])[0]

yield (idx_train, idx_test)
Loading