-
Notifications
You must be signed in to change notification settings - Fork 11
/
data.py
123 lines (99 loc) · 4.85 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit
from typing import List, Optional, Tuple
class TimeSeriesDataset:
"""
A class for preprocessing and loading time series data for pytorch models.
attributes:
data (pd.DataFrame): the input time series data.
categorical_cols (List[str]): list of categorical column names.
target_col (str): name of the target column.
seq_length (int): length of the input sequence.
prediction_window (int): length of the prediction window.
numerical_cols (List[str]): list of numerical column names.
preprocessor (ColumnTransformer): sklearn preprocessor for data transformation.
"""
def __init__(self,
data: pd.DataFrame,
categorical_cols: List[str],
target_col: str,
seq_length: int,
prediction_window: int = 1):
"""
Initialize the TimeSeriesDataset.
args:
data (pd.DataFrame): the input time series data.
categorical_cols (List[str]): list of categorical column names.
target_col (str): name of the target column.
seq_length (int): length of the input sequence.
prediction_window (int): length of the prediction window.
"""
self.data = data
self.categorical_cols = categorical_cols
self.numerical_cols = list(set(data.columns) - set(categorical_cols) - {target_col})
self.target_col = target_col
self.seq_length = seq_length
self.prediction_window = prediction_window
self.preprocessor = None
def preprocess_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""
Preprocess the data using sklearn ColumnTransformer.
returns:
Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: preprocessed training and testing data.
"""
X = self.data.drop(self.target_col, axis=1)
y = self.data[self.target_col]
self.preprocessor = ColumnTransformer(
[("scaler", StandardScaler(), self.numerical_cols),
("encoder", OneHotEncoder(sparse=False, handle_unknown='ignore'), self.categorical_cols)],
remainder="passthrough"
)
# use timeseriessplit for time series data :cite[c4]
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
X_train = self.preprocessor.fit_transform(X_train)
X_test = self.preprocessor.transform(X_test)
return X_train, X_test, y_train.values, y_test.values
def frame_series(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> TensorDataset:
"""
Create a TensorDataset from the input data.
args:
X (np.ndarray): input features.
y (Optional[np.ndarray]): target values.
returns:
TensorDataset: dataset containing the framed series.
"""
nb_obs, nb_features = X.shape
features, target, y_hist = [], [], []
for i in range(nb_obs - self.seq_length - self.prediction_window + 1):
features.append(torch.FloatTensor(X[i:i + self.seq_length, :]))
features_var = torch.stack(features)
if y is not None:
for i in range(nb_obs - self.seq_length - self.prediction_window + 1):
target.append(torch.FloatTensor(y[i + self.seq_length:i + self.seq_length + self.prediction_window]))
y_hist.append(
torch.FloatTensor(y[i + self.seq_length - 1:i + self.seq_length + self.prediction_window - 1]))
target_var, y_hist_var = torch.stack(target), torch.stack(y_hist)
return TensorDataset(features_var, target_var, y_hist_var)
return TensorDataset(features_var)
def get_loaders(self, batch_size: int) -> Tuple[DataLoader, DataLoader]:
"""
Create DataLoader objects for training and testing data.
args:
batch_size (int): size of each batch.
returns:
Tuple[DataLoader, DataLoader]: DataLoader objects for training and testing data.
"""
X_train, X_test, y_train, y_test = self.preprocess_data()
train_dataset = self.frame_series(X_train, y_train)
test_dataset = self.frame_series(X_test, y_test)
train_iter = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
test_iter = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
return train_iter, test_iter