# utils.py
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp

from student_utils import create_tf_numeric_feature

def aggregate_dataset(df, grouping_field_list, array_field):
    # Collapse the line-level dataframe to one row per group: collect every
    # array_field value seen in the group into a Python list.
    df = df.groupby(grouping_field_list)[['encounter_id', array_field]].apply(
        lambda x: x[array_field].values.tolist()).reset_index().rename(
        columns={0: array_field + "_array"})

    # One-hot encode the collected lists, then sum the indicators back to one
    # row per group (.sum(level=0) is deprecated in newer pandas; use
    # groupby(level=0).sum() instead).
    dummy_df = pd.get_dummies(
        df[array_field + '_array'].apply(pd.Series).stack()).groupby(level=0).sum()
    dummy_col_list = [x.replace(" ", "_") for x in list(dummy_df.columns)]
    mapping_name_dict = dict(zip(dummy_df.columns, dummy_col_list))  # original -> sanitized names
    concat_df = pd.concat([df, dummy_df], axis=1)
    new_col_list = [x.replace(" ", "_") for x in list(concat_df.columns)]
    concat_df.columns = new_col_list
    return concat_df, dummy_col_list
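
# Usage sketch for aggregate_dataset. The grouping keys and array field below
# are illustrative; substitute the columns from your own line-level dataset.
def _example_aggregate(line_df):
    # Collapse one-row-per-line-item data to one row per encounter, with a
    # one-hot indicator column for every distinct ndc_code value.
    agg_df, dummy_cols = aggregate_dataset(
        line_df,
        grouping_field_list=['encounter_id', 'patient_nbr'],
        array_field='ndc_code')
    return agg_df, dummy_cols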

def cast_df(df, col, d_type=str):
    # Cast a single column to the requested dtype.
    return df[col].astype(d_type)


def impute_df(df, col, impute_value=0):
    # Fill missing values in a single column with a constant.
    return df[col].fillna(impute_value)

def preprocess_df(df, categorical_col_list, numerical_col_list, predictor,
                  categorical_impute_value='nan', numerical_impute_value=0):
    df[predictor] = df[predictor].astype(float)
    for c in categorical_col_list:
        # Casting to str renders missing values as the string 'nan', which
        # matches the default categorical_impute_value.
        df[c] = cast_df(df, c, d_type=str)
    for numerical_column in numerical_col_list:
        df[numerical_column] = impute_df(df, numerical_column, numerical_impute_value)
    return df
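
# Usage sketch for preprocess_df; the column lists and predictor name are
# placeholders for whatever schema the calling notebook defines.
def _example_preprocess(df):
    categorical_cols = ['primary_diagnosis_code']  # hypothetical
    numerical_cols = ['number_inpatient']          # hypothetical
    return preprocess_df(df, categorical_cols, numerical_cols,
                         predictor='time_in_hospital')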

# adapted from https://www.tensorflow.org/tutorials/structured_data/feature_columns
def df_to_dataset(df, predictor, batch_size=32):
    df = df.copy()
    labels = df.pop(predictor)  # split the label column off the features
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    return ds
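
# Usage sketch for df_to_dataset: turn a preprocessed dataframe into a batched
# tf.data pipeline ('readmitted' is a placeholder predictor name).
def _example_dataset(processed_df):
    train_ds = df_to_dataset(processed_df, predictor='readmitted', batch_size=64)
    for feature_batch, label_batch in train_ds.take(1):
        print('feature keys:', list(feature_batch.keys()))
        print('label batch:', label_batch)
    return train_ds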

# build vocab for categorical features
def write_vocabulary_file(vocab_list, field_name, default_value, vocab_dir='./diabetes_vocab/'):
    # make sure the vocab directory exists before writing
    os.makedirs(vocab_dir, exist_ok=True)
    output_file_path = os.path.join(vocab_dir, str(field_name) + "_vocab.txt")
    # put default value in first row as TF requires
    vocab_list = np.insert(vocab_list, 0, default_value, axis=0)
    pd.DataFrame(vocab_list).to_csv(output_file_path, index=None, header=None)
    return output_file_path

def build_vocab_files(df, categorical_column_list, default_value='00'):
    vocab_files_list = []
    for c in categorical_column_list:
        v_file = write_vocabulary_file(df[c].unique(), c, default_value)
        vocab_files_list.append(v_file)
    return vocab_files_list
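
# Sketch of how the vocabulary files written above are typically consumed via
# TensorFlow's feature-column API (an assumed downstream step, not something
# this module itself performs):
def _example_vocab_column(field_name, vocab_file_path):
    cat_col = tf.feature_column.categorical_column_with_vocabulary_file(
        key=field_name, vocabulary_file=vocab_file_path)
    # wrap in an indicator column so DenseFeatures can consume it
    return tf.feature_column.indicator_column(cat_col)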

def show_group_stats_viz(df, group):
    # print group sizes, then draw a horizontal bar chart of the distribution
    print(df.groupby(group).size())
    df.groupby(group).size().plot(kind='barh')

'''
Adapted from the TensorFlow Probability regression tutorial:
https://github.com/tensorflow/probability/blob/master/tensorflow_probability/examples/jupyter_notebooks/Probabilistic_Layers_Regression.ipynb
'''
def posterior_mean_field(kernel_size, bias_size=0, dtype=None):
    # Mean-field variational posterior: an independent Normal per weight,
    # with a trainable location and a softplus-transformed trainable scale.
    n = kernel_size + bias_size
    c = np.log(np.expm1(1.))  # softplus inverse of 1, so the scale starts near 1
    return tf.keras.Sequential([
        tfp.layers.VariableLayer(2 * n, dtype=dtype),
        tfp.layers.DistributionLambda(lambda t: tfp.distributions.Independent(
            tfp.distributions.Normal(loc=t[..., :n],
                                     scale=1e-5 + tf.nn.softplus(c + t[..., n:])),
            reinterpreted_batch_ndims=1)),
    ])

def prior_trainable(kernel_size, bias_size=0, dtype=None):
    # Trainable prior: an independent unit-scale Normal per weight, with a
    # learnable location.
    n = kernel_size + bias_size
    return tf.keras.Sequential([
        tfp.layers.VariableLayer(n, dtype=dtype),
        tfp.layers.DistributionLambda(lambda t: tfp.distributions.Independent(
            tfp.distributions.Normal(loc=t, scale=1),
            reinterpreted_batch_ndims=1)),
    ])
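
# Sketch of wiring posterior_mean_field and prior_trainable into a
# probabilistic output layer, mirroring the TFP regression tutorial they were
# adapted from; train_size (the number of training examples) scales the KL term.
def _example_variational_layer(train_size, units=1):
    return tfp.layers.DenseVariational(
        units=units,
        make_posterior_fn=posterior_mean_field,
        make_prior_fn=prior_trainable,
        kl_weight=1 / train_size)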

def demo(feature_column, example_batch):
    # Apply a single feature column to an example batch and show the result
    # (compute the layer output once instead of twice).
    feature_layer = tf.keras.layers.DenseFeatures(feature_column)
    output = feature_layer(example_batch)
    print(output)
    return output

def calculate_stats_from_train_data(df, col):
    # Compute describe() once rather than once per statistic.
    stats = df[col].describe()
    return stats['mean'], stats['std']

def create_tf_numerical_feature_cols(numerical_col_list, train_df):
    # Build one normalized TF numeric feature column per numerical field,
    # using statistics computed from the training data only.
    tf_numeric_col_list = []
    for c in numerical_col_list:
        mean, std = calculate_stats_from_train_data(train_df, c)
        tf_numeric_feature = create_tf_numeric_feature(c, mean, std)
        tf_numeric_col_list.append(tf_numeric_feature)
    return tf_numeric_col_list
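
# End-to-end sketch: build normalized numeric feature columns from the training
# split and sanity-check them with demo(). The column name and example batch are
# placeholders; create_tf_numeric_feature comes from student_utils and is
# assumed to return a TF feature column.
def _example_numeric_cols(train_df, example_batch):
    numeric_cols = create_tf_numerical_feature_cols(['num_lab_procedures'], train_df)
    for col in numeric_cols:
        demo(col, example_batch)
    return numeric_cols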