-
Notifications
You must be signed in to change notification settings - Fork 1.3k
/
kaggle_data.py
64 lines (53 loc) · 1.76 KB
/
kaggle_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
Utility functions to load Kaggle Otto Group Challenge Data.
Since these data and functions are used in many notebooks, the functions
that load and manipulate the data are centralised here so that the code
is not replicated in each notebook.
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
def load_data(path, train=True, random_state=None):
    """Load data from a CSV File

    The first column of the file is treated as the sample id.  For
    training data the last column is treated as the label and every
    column in between as a feature; for non-training data every column
    after the id is a feature.

    Parameters
    ----------
    path: str
        The path to the CSV file
    train: bool (default True)
        Decide whether or not data are *training data*.
        If True, the rows are randomly shuffled and the last column is
        returned as labels. If False, the rows keep their file order and
        the first column is returned as ids.
    random_state: int or None (default None)
        Seed used to shuffle training data. If None, the global NumPy
        random state is used (as controlled by ``np.random.seed``).
        Ignored when ``train`` is False.

    Return
    ------
    X: numpy.ndarray
        The features as a two dimensional array of ``float32``
    labels or ids: numpy.ndarray
        If ``train`` is True, the vector of labels (last column) for
        each sample, in the same shuffled order as ``X``.
        If ``train`` is False, the vector of ids (first column) for each
        sample, converted to strings.
    """
    df = pd.read_csv(path)

    if not train:
        X = df.values[:, 1:].astype(np.float32)
        # Read the ids from their own column rather than from the
        # whole-frame array: a float feature column would upcast the
        # frame and turn the id 1 into the string '1.0'.
        ids = df.iloc[:, 0].values.astype(str)
        return X, ids

    # Copy before shuffling in place: `df.values` may be a view of (or a
    # read-only window onto) the DataFrame's own storage.
    X = df.values.copy()
    if random_state is None:
        np.random.shuffle(X)  # https://youtu.be/uyUXoap67N8
    else:
        np.random.RandomState(random_state).shuffle(X)
    # Rows are shuffled as a whole, so features and labels stay aligned.
    X, labels = X[:, 1:-1].astype(np.float32), X[:, -1]
    return X, labels
def preprocess_data(X, scaler=None):
    """Preprocess input data by standardise features
    by removing the mean and scaling to unit variance

    Parameters
    ----------
    X: array-like
        The data to transform
    scaler: object or None (default None)
        An already fitted scaler exposing ``transform``. If None, a new
        ``StandardScaler`` is created and fitted on ``X``; a provided
        scaler is used as is and is NOT re-fitted.

    Return
    ------
    X: the transformed data, as returned by ``scaler.transform``
    scaler: the scaler that was used (the new one, or the one passed in),
        so that it can be re-used on other data
    """
    # Compare against None explicitly: a provided scaler must never be
    # discarded just because the object happens to evaluate as falsy.
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler
def preprocess_labels(labels, encoder=None, categorical=True):
    """Encode labels with values among 0 and `n-classes-1`

    Parameters
    ----------
    labels: array-like
        The labels to encode
    encoder: object or None (default None)
        An already fitted encoder exposing ``transform``. If None, a new
        ``LabelEncoder`` is created and fitted on ``labels``; a provided
        encoder is used as is and is NOT re-fitted.
    categorical: bool (default True)
        If True, the integer codes are further converted to a one-hot
        matrix by ``np_utils.to_categorical``.

    Return
    ------
    y: numpy.ndarray
        The ``int32`` codes if ``categorical`` is False, otherwise the
        result of ``np_utils.to_categorical``
    encoder: the encoder that was used (the new one, or the one passed
        in), so that it can be re-used on other labels
    """
    # Compare against None explicitly: a provided encoder must never be
    # discarded just because the object happens to evaluate as falsy.
    if encoder is None:
        encoder = LabelEncoder()
        encoder.fit(labels)
    y = encoder.transform(labels).astype(np.int32)
    if categorical:
        # Size the one-hot matrix from the classes the encoder knows,
        # not from max(y) + 1: with a re-used encoder, `labels` may lack
        # the highest class and the matrix would come out too narrow.
        # Falls back to inference (None) if the encoder has no `classes_`.
        known_classes = getattr(encoder, "classes_", None)
        n_classes = None if known_classes is None else len(known_classes)
        # Passed positionally because the keyword name differs between
        # Keras versions.
        y = np_utils.to_categorical(y, n_classes)
    return y, encoder