Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Up my solution #107

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 81 additions & 7 deletions sklearn_questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
"""
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
Expand All @@ -61,6 +62,20 @@
from sklearn.metrics.pairwise import pairwise_distances


def most_common_label(array):
    """Return the label that occurs most often in ``array``.

    Parameters
    ----------
    array : iterable of hashable
        The labels to count (for example the labels of the nearest
        neighbours of one sample).

    Returns
    -------
    most_common : hashable or None
        The most frequent label, or ``None`` when ``array`` is empty.
        When several labels share the highest count, the one encountered
        first in ``array`` is returned.
    """
    ranked = Counter(array).most_common(1)
    if not ranked:
        # Nothing to vote on: keep the "no label" sentinel.
        return None
    # most_common(1) yields a single (label, count) pair.
    label, _count = ranked[0]
    return label


class KNearestNeighbors(BaseEstimator, ClassifierMixin):
"""KNearestNeighbors classifier."""

Expand All @@ -82,6 +97,12 @@ def fit(self, X, y):
self : instance of KNearestNeighbors
The current instance of the classifier
"""
X, y = check_X_y(X, y)
check_classification_targets(y)
self.X_ = X
self.y_ = y
self.n_features_in_ = X.shape[1]
self.classes_ = np.unique(y)
return self

def predict(self, X):
Expand All @@ -97,7 +118,12 @@ def predict(self, X):
y : ndarray, shape (n_test_samples,)
Predicted class labels for each test data sample.
"""
y_pred = np.zeros(X.shape[0])
check_is_fitted(self)
X = check_array(X)
closest = np.argsort(pairwise_distances(X, self.X_))[:,
:self.n_neighbors]
y_pred = np.array([most_common_label(self.y_[closest[i]]) for i in
range(X.shape[0])])
return y_pred

def score(self, X, y):
Expand All @@ -115,7 +141,10 @@ def score(self, X, y):
score : float
Accuracy of the model computed for the (X, y) pairs.
"""
return 0.
check_classification_targets(y)
y_pred = self.predict(X)
score = (y == y_pred).mean()
return score


class MonthlySplit(BaseCrossValidator):
Expand Down Expand Up @@ -155,7 +184,14 @@ def get_n_splits(self, X, y=None, groups=None):
n_splits : int
The number of splits.
"""
return 0
if self.time_col == 'index':
unique_year_month = set([(date.year, date.month)
for date in X.index])
else:
unique_year_month = set([(date.year, date.month)
for date in X[self.time_col]])
n_splits = len(unique_year_month)-1
return n_splits

def split(self, X, y, groups=None):
"""Generate indices to split data into training and test set.
Expand All @@ -177,12 +213,50 @@ def split(self, X, y, groups=None):
idx_test : ndarray
The testing set indices for that split.
"""

n_samples = X.shape[0]
if self.time_col == 'index':
if not isinstance(X.index, pd.DatetimeIndex):
raise ValueError("Index of the DataFrame is not datetime")
df = X.index.map(lambda date: (date.year, date.month))
else:
if not isinstance(X[self.time_col], pd.Series) or \
X[self.time_col].dtype != 'datetime64[ns]':
raise ValueError("The splitting column is not a datetime")
df = X[self.time_col].apply(lambda date: (date.year, date.month))
unique_year_month = sorted(set(df))
n_splits = self.get_n_splits(X, y, groups)
assert len(unique_year_month) == n_splits+1, "The number \
of unique year-month pairs must be equal to the n_split+1"
for i in range(n_splits):
idx_train = range(n_samples)
idx_test = range(n_samples)
idx_train = [X.index.get_loc(elm)
for elm in X[df == unique_year_month[i]]
.index.tolist()]
idx_test = [X.index.get_loc(elm)
for elm in X[df == unique_year_month[i+1]]
.index.tolist()]
yield (
idx_train, idx_test
)
df = None


def main():
    """Print the first few ``MonthlySplit`` folds of a synthetic dataset.

    Builds a frame of random values indexed by every day from 2020-01-01 to
    2022-01-01, a matching frame of random integer labels drawn from
    {1, 2, 3}, and prints the rows selected for training and testing by each
    of the first five splits.
    """
    # Number of splits to display before stopping the generator.
    max_splits_shown = 5

    # Create a DataFrame indexed by a daily datetime range.
    date_range = pd.date_range(start='1/1/2020', end='1/1/2022', freq='D')
    df = pd.DataFrame(
        {'value': np.random.rand(len(date_range))},
        index=date_range,
    )
    # randint's upper bound is exclusive, so labels are 1, 2 or 3.
    y = pd.DataFrame(
        {'label': np.random.randint(1, 4, df.shape[0])},
        index=date_range,
    )

    splitter = MonthlySplit()
    # Use the generator to get the training and test indices for each split.
    for i, (idx_train, idx_test) in enumerate(splitter.split(df, y)):
        # The labels say "indices", but what is printed are the rows of
        # ``df`` selected by those positional indices.
        print(f"Training indices: {df.iloc[idx_train]}")
        print(f"Test indices: {df.iloc[idx_test]}")
        if i == max_splits_shown - 1:
            break


# Run the demo only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Loading