Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Final Solution Sklearn Task #103

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 149 additions & 19 deletions sklearn_questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
to compute distances between 2 sets of samples.
"""
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
Expand All @@ -60,6 +59,9 @@
from sklearn.utils.multiclass import check_classification_targets
from sklearn.metrics.pairwise import pairwise_distances

from pandas.api.types import is_datetime64_any_dtype
from collections import Counter


class KNearestNeighbors(BaseEstimator, ClassifierMixin):
"""KNearestNeighbors classifier."""
Expand All @@ -73,49 +75,162 @@ def fit(self, X, y):
Parameters
----------
X : ndarray, shape (n_samples, n_features)
Data to train the model.
Data to train the model (X_train).
y : ndarray, shape (n_samples,)
Labels associated with the training data.
Labels associated with the training data (y_train).

Returns
----------
self : instance of KNearestNeighbors
The current instance of the classifier
"""
# Checks
"""
The check_classification_targets function ensures that target y is of
a non-regression type. Only the following target types
(as defined in type_of_target) are allowed: 'binary', 'multiclass',
'multiclass-multioutput', 'multilabel-indicator',
'multilabel-sequences'.
"""
check_classification_targets(y)
"""
The check_X_y function performs an input validation for standard
estimators (=models). It checks X and y for consistent length,
enforces X to be 2D and y 1D. By default, X is checked to be non-empty
and containing only finite values. Standard input checks are also
applied to y, such as checking that y does not have np.nan or np.inf
targets. For multi-label y, set multi_output=True to allow 2D and
sparse y. If the dtype of X is object, attempt converting to
float, raising on failure.
"""
X, y = check_X_y(X, y)

# Number of features seen during fit
self.n_features_in_ = X.shape[1]
# Classes of y
self.classes_ = np.unique(y)
# Instance of X and y as objects of the class
self.X_ = X
self.y_ = y

return self

def predict(self, X):
    """Predict class labels by majority vote among the nearest neighbors.

    Parameters
    ----------
    X : ndarray, shape (n_test_samples, n_features)
        Data to predict on (X_test).

    Returns
    -------
    y : ndarray, shape (n_test_samples,)
        Predicted class labels for each test data sample.
    """
    check_is_fitted(self)
    # Keep the validated array: check_array returns the converted input,
    # so the rest of the method always works on what was actually checked.
    X = check_array(X)

    # dist_matrix[i, j] is the distance between the i-th sample of X and
    # the j-th training sample stored in self.X_.
    dist_matrix = pairwise_distances(X, self.X_)

    # For each test sample, indices of the self.n_neighbors closest
    # training samples, nearest first.
    nearest_idx = np.argsort(dist_matrix, axis=1)[:, :self.n_neighbors]
    neighbor_labels = self.y_[nearest_idx]

    # Majority vote per row. most_common(1) keeps the first-encountered
    # label on ties, i.e. the label of the closest neighbor among the
    # tied classes (same tie-breaking as max(counter, key=counter.get)).
    y_pred = [
        Counter(row).most_common(1)[0][0] for row in neighbor_labels
    ]

    return np.array(y_pred)

def score(self, X, y):
    """Calculate the accuracy of the predictions on (X, y).

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Data to score on (X_test).
    y : ndarray, shape (n_samples,)
        Target values (y_test).

    Returns
    -------
    score : float
        Accuracy of the model computed for the (X, y) pairs, i.e. the
        fraction of samples whose predicted label equals the target.

    Raises
    ------
    ValueError
        If ``y`` does not have the same shape as the predictions.
    """
    check_is_fitted(self)
    check_classification_targets(y)

    # predict() validates X itself, so no separate check_array call here.
    preds = self.predict(X)
    y_true = np.asarray(y)

    # Without this guard, `preds == y_true` would silently broadcast (or
    # collapse to a single bool) and yield a meaningless score.
    if y_true.shape != preds.shape:
        raise ValueError(
            "y has shape %s but predictions have shape %s"
            % (y_true.shape, preds.shape)
        )

    return float(np.mean(preds == y_true))


class MonthlySplit(BaseCrossValidator):
Expand Down Expand Up @@ -155,7 +270,15 @@ def get_n_splits(self, X, y=None, groups=None):
n_splits : int
The number of splits.
"""
return 0
X = X.reset_index()

if not is_datetime64_any_dtype(X[self.time_col]):
raise ValueError("Not in a datetimeFormat")

date = X[self.time_col]
date_y_m = date.apply(lambda x: str(x.year) + str(x.month))

return date_y_m.unique().shape[0] - 1

def split(self, X, y, groups=None):
    """Generate indices to split data into training and test set.

    The samples are grouped by calendar month (year and month of the
    ``self.time_col`` values). Each split trains on one month and tests on
    the next month present in the data, in chronological order, so the
    number of splits is the number of distinct months minus one.

    Parameters
    ----------
    X : pandas object supporting ``reset_index``
        Data to split. After ``reset_index()``, the column named
        ``self.time_col`` must hold datetime values.
    y : array-like
        Not used by this implementation.
    groups : array-like, default=None
        Not used by this implementation.

    Yields
    ------
    idx_train : ndarray
        The training set indices for that split.
    idx_test : ndarray
        The testing set indices for that split.

    Raises
    ------
    ValueError
        If the time column does not have a datetime dtype.
    """
    # reset_index() turns the index into a regular column and leaves a
    # 0..n-1 index, so index labels below are positions in the original X.
    frame = X.reset_index()
    dates = frame[self.time_col]
    if not is_datetime64_any_dtype(dates):
        raise ValueError("Not in a datetimeFormat")

    # Stable chronological order keeps the indices inside each month
    # sorted by time, with ties left in their original order.
    dates = dates.sort_values(kind="stable")

    # One integer per calendar month; monotonic across year boundaries.
    month_key = dates.dt.year * 12 + dates.dt.month

    # groupby sorts the keys, so the blocks come out oldest month first.
    month_blocks = [
        block.index.to_numpy() for _, block in month_key.groupby(month_key)
    ]

    for idx_train, idx_test in zip(month_blocks, month_blocks[1:]):
        yield idx_train, idx_test
Loading