preprocessing.py (forked from LukasKG/SHL_GAN)

# -*- coding: utf-8 -*-
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import numpy as np
import torch

from log import log
import data_source as ds

def get_labels():
    """ Returns list with unique labels """
    return np.fromiter(ds.LABELS_SHL.keys(), dtype=int)

def get_size(params):
    ''' Returns the input shape and number of output classes of a dataset '''
    X = 20*len(ds.FEATURES[params['FX_sel']])
    Y = get_labels().shape[0]
    return [X, Y]

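# Example (sketch): if params['FX_sel'] selected a feature set with, say, 9
# entries in ds.FEATURES (the 9 is hypothetical), get_size would report
# 20*9 = 180 inputs and one output per entry in ds.LABELS_SHL.
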
def scale_minmax(X):
    ''' Scale data between -1 and 1 to fit the Generator's tanh output '''
    scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1), copy=True)
    scaler.fit(X)
    return scaler.transform(X)

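# Example (sketch): each column is scaled independently, so
# scale_minmax(np.array([[0., 5.], [10., 15.]])) returns [[-1., -1.], [1., 1.]].
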
def select_random(X0, Y0, ratio):
    ''' Select random samples based on a given target ratio '''
    ix = np.random.choice(len(X0), size=int(ratio*len(X0)), replace=False)
    X1 = X0[ix]
    if Y0 is None:
        Y1 = None
    else:
        Y1 = Y0[ix]
    return X1, Y1

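# Example (sketch): select_random(X, Y, ratio=0.1) keeps a random 10% of the
# samples; Y0=None is supported for unlabelled data and returns (X1, None).
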
def over_sampling(params, X, Y):
    ''' Oversample the minority classes with SMOTE to balance the data '''
    labels = one_hot_to_labels(params, Y)
    smote = SMOTE(sampling_strategy='not majority', k_neighbors=5)
    # fit_resample replaces the deprecated fit_sample (removed in imblearn 0.6)
    data, labels = smote.fit_resample(X, labels)
    return data, labels_to_one_hot(params, labels)

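# Note: 'not majority' oversamples every class except the largest one, so all
# classes end up with the same number of samples. SMOTE interpolates between a
# sample and its nearest neighbours, so each minority class needs at least
# k_neighbors + 1 = 6 samples.
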
def split_data(X, Y):
    ''' Split the data 50/50 into training and test sets '''
    return train_test_split(X, Y, test_size=0.5)

def get_one_hot_labels(params, num):
    ''' Draw num random labels from params['label'] and one-hot encode them '''
    labels = np.random.choice(params['label'], size=num, replace=True, p=None)
    return labels_to_one_hot(params, labels)

def labels_to_one_hot(params, labels):
    ''' Turn an array of labels into a one-hot encoded array '''
    Y = np.zeros((labels.shape[0], params['label'].shape[0]))
    for i in range(labels.shape[0]):
        j = np.where(params['label'] == labels[i])[0][0]
        Y[i, j] = 1
    return Y

def one_hot_to_labels(params, Y):
    ''' Turn a one-hot encoded array (numpy or torch) back into labels '''
    if torch.is_tensor(Y):
        Y = Y.detach().cpu().numpy()
    # argmax picks the first maximum, matching the original np.where lookup
    return np.array([params['label'][np.argmax(oh)] for oh in Y])

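# Example (sketch), assuming params['label'] = np.array([1, 2, 3]):
#   labels_to_one_hot(params, np.array([2]))            -> [[0., 1., 0.]]
#   one_hot_to_labels(params, np.array([[0., 1., 0.]])) -> [2]
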
def get_data(params, dataset):
    X, Y = ds.load_data(params, dataset)
    if Y is not None:
        Y = labels_to_one_hot(params, Y)
    return X, Y

def get_prediction(params, src_path):
    pred = ds.read_prediction(params, src_path)
    if pred is not None:
        return labels_to_one_hot(params, pred)
    return None

def get_tensor(X, Y=None):
    ''' Convert numpy arrays to float tensors on the GPU when available '''
    if not torch.is_tensor(X):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        X = torch.from_numpy(X).float().to(device)
        if Y is not None:
            Y = torch.from_numpy(Y).float().to(device)
    return X, Y

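# Note: inputs that are already tensors pass through unchanged, so calling
# get_tensor twice is safe; numpy inputs are placed on the GPU when available.
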
class Permanent_Dataloader:
    ''' Wraps a DataLoader so batches can be drawn indefinitely '''
    def __init__(self, dataloader):
        self.dataloader = dataloader
        self.iterator = iter(self.dataloader)

    def get_next(self):
        try:
            data = next(self.iterator)
        except StopIteration:
            # StopIteration is raised when the dataset ends;
            # reinitialise the iterator to start a new pass
            self.iterator = iter(self.dataloader)
            data = next(self.iterator)
        return data

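# Example (sketch): handy in GAN-style training loops where batches are drawn
# on demand rather than per epoch:
#   real_loader = Permanent_Dataloader(get_dataloader(params, X, Y))
#   batch_X, batch_Y = real_loader.get_next()  # restarts at the end of an epoch
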
def get_dataloader(params, X, Y, batch_size=None):
    # transform to torch tensors
    if not torch.is_tensor(X):
        X, Y = get_tensor(X, Y)

    # create the dataset
    if Y is not None:
        dataset = torch.utils.data.TensorDataset(X, Y)
    else:
        dataset = torch.utils.data.TensorDataset(X)

    if batch_size is None:
        batch_size = params['batch_size']

    # configure the data loader
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    return dataloader

def get_perm_dataloader(params, X, Y):
    dataloader = get_dataloader(params, X, Y)
    perm_dataloader = Permanent_Dataloader(dataloader)
    return perm_dataloader
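

if __name__ == '__main__':
    # Minimal usage sketch with synthetic data. The params values below are
    # illustrative assumptions, not the project's defaults.
    params = {'label': np.array([1, 2, 3]), 'batch_size': 16}

    X = np.random.rand(64, 10).astype(np.float32)  # 64 samples, 10 features
    Y = get_one_hot_labels(params, num=64)         # random one-hot targets

    X = scale_minmax(X)                            # squash into [-1, 1]
    X_train, X_test, Y_train, Y_test = split_data(X, Y)

    loader = get_perm_dataloader(params, X_train, Y_train)
    batch_X, batch_Y = loader.get_next()           # never raises StopIteration
    print('batch shapes:', batch_X.shape, batch_Y.shape)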