-
Notifications
You must be signed in to change notification settings - Fork 7
/
understanding_data.py
106 lines (75 loc) · 3.22 KB
/
understanding_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 15 16:15:49 2018
@author: abinaya
"""
import pandas as pd
import matplotlib.pyplot as plt
# Close all currently open matplotlib figures before this script creates new ones.
plt.close("all")
class understanding_data:
    """Per-class exploratory plots for a binary-labelled DataFrame.

    The frame is split on its ``"y"`` column into the ``"no"`` and ``"yes"``
    groups, and every plotting method draws both groups on one figure so the
    distribution of a feature can be compared between the two classes.
    """

    def __init__(self, raw_df):
        """Split *raw_df* on the ``"y"`` column and cache both class groups.

        Parameters
        ----------
        raw_df : pandas.DataFrame
            Must contain a ``"y"`` column in which both ``"no"`` and ``"yes"``
            occur.

        Raises
        ------
        KeyError
            If ``"y"`` is not a column, or if either class label is absent
            from it (``get_group`` cannot find the group).
        """
        self.raw_df = raw_df
        self.raw_df_grouped = raw_df.groupby("y")
        self.class_name_no = "no"
        self.class_name_yes = "yes"
        self.raw_df_grouped_no = self.raw_df_grouped.get_group(self.class_name_no)
        self.raw_df_grouped_yes = self.raw_df_grouped.get_group(self.class_name_yes)

    def _count_by_class(self, feature_name):
        """Return per-class value counts of *feature_name* as one DataFrame.

        The result has one row per distinct value seen in *either* class and
        one column per class label. A value that never occurs in a class gets
        a count of 0 instead of being dropped or left as NaN.
        """
        no_counts = self.raw_df_grouped_no[feature_name].value_counts()
        yes_counts = self.raw_df_grouped_yes[feature_name].value_counts()
        # Keep the "no" ordering (most frequent first) and append the values
        # that only ever appear in the "yes" class, so none are lost.
        yes_only = yes_counts.index.difference(no_counts.index)
        categories = no_counts.index.append(yes_only)
        return pd.DataFrame(
            {
                self.class_name_no: no_counts.reindex(categories, fill_value=0),
                self.class_name_yes: yes_counts.reindex(categories, fill_value=0),
            },
            # Explicit column order: dict ordering is not guaranteed on the
            # Python 2 interpreter named in the file's shebang.
            columns=[self.class_name_no, self.class_name_yes],
        )

    def plot_histogram_continuous(self, feature_name, bin_size):
        """Overlay the "no" and "yes" histograms of a numeric feature.

        Parameters
        ----------
        feature_name : str
            Column to plot.
        bin_size
            Forwarded to ``plt.hist`` as ``bins`` (a bin count, or anything
            else ``bins`` accepts).
        """
        values_no = self.raw_df_grouped_no[feature_name]
        values_yes = self.raw_df_grouped_yes[feature_name]
        # Both classes must share the same bin edges, otherwise each hist call
        # derives edges from its own min/max and the bars are not comparable.
        combined = pd.concat([values_no, values_yes])
        shared_range = (combined.min(), combined.max())

        plt.figure()
        plt.hist(values_no, bins=bin_size, range=shared_range, label=self.class_name_no)
        plt.hist(values_yes, bins=bin_size, range=shared_range, label=self.class_name_yes)
        plt.legend()
        plt.title("Feature Histogram - "+feature_name)
        plt.xlabel("Feature values")
        plt.ylabel("Count")

    def plot_histogram_categorical(self, feature_name):
        """Draw a grouped bar chart of value counts per class for a feature.

        Parameters
        ----------
        feature_name : str
            Column whose distinct values become the bar groups.
        """
        feature_df = self._count_by_class(feature_name)
        feature_df.plot(kind='bar')
        plt.title("Feature Histogram - "+feature_name)
        plt.ylabel("Count")
        plt.xlabel("Feature unique values")
        plt.tight_layout()
### Load the dataset and build the per-class plotting helper
# NOTE(review): read_csv is called with its default separator, so this assumes
# the local copy of the file is comma-separated - confirm.
raw_df = pd.read_csv('Data/bank-additional.csv')
understanding_data_obj = understanding_data(raw_df)

### Plot plan, in display order.
### (feature name, bin count) -> continuous histogram with that many bins
### (feature name, None)      -> categorical bar chart
_FEATURE_PLOTS = (
    ("age", 50),              # Feature 1  - AGE
    ("job", None),            # Feature 2  - JOB
    ("marital", None),        # Feature 3  - MARITAL
    ("education", None),      # Feature 4  - EDUCATION
    ("default", None),        # Feature 5  - DEFAULT
    ("housing", None),        # Feature 6  - HOUSING
    ("loan", None),           # Feature 7  - LOAN
    ("contact", None),        # Feature 8  - CONTACT
    ("month", None),          # Feature 9  - MONTH
    ("day_of_week", None),    # Feature 10 - DAY OF WEEK
    ("campaign", 30),         # Feature 11 - CAMPAIGN
    ("pdays", 30),            # Feature 12 - PDAYS
    ("previous", None),       # Feature 13 - PREVIOUS (plotted as categorical)
    ("poutcome", None),       # Feature 14 - POUTCOME
    ("emp.var.rate", 50),     # Feature 15 - emp.var.rate
    ("cons.price.idx", 50),   # Feature 16 - cons.price.idx
    ("cons.conf.idx", 50),    # Feature 17 - cons.conf.idx
    ("euribor3m", 50),        # Feature 18 - euribor3m
    ("nr.employed", 50),      # Feature 19 - nr.employed
)

### One figure per feature
for _feature_name, _bin_count in _FEATURE_PLOTS:
    if _bin_count is None:
        understanding_data_obj.plot_histogram_categorical(_feature_name)
    else:
        understanding_data_obj.plot_histogram_continuous(_feature_name, _bin_count)