-
Notifications
You must be signed in to change notification settings - Fork 0
/
decisionTreeAndForestOnResumeFiltering.py
71 lines (35 loc) · 1.5 KB
/
decisionTreeAndForestOnResumeFiltering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import numpy as np
import pandas as pd
# Decision trees split on the feature that most reduces entropy — a measure of
# how mixed (impure) the labels in a subset are.
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

input_file = 'PastHires.csv'
df = pd.read_csv(input_file)
print(df.head())

# scikit-learn needs numeric features: map the Y/N string columns to 1/0.
yes_no = {"Y": 1, "N": 0}
for col in ("Hired", "Employed?", "Top-tier school", "Interned"):
    df[col] = df[col].map(yes_no)

# Ordinal encoding for education level (BS < MS < PhD).
education = {"BS": 0, "MS": 1, "PhD": 2}
df["Level of Education"] = df["Level of Education"].map(education)
print(df.head())

# The first 6 columns are the predictors; the 7th ("Hired") is the target.
features = list(df.columns[:6])
print(features)
y = df["Hired"]
x = df[features]

classifier = tree.DecisionTreeClassifier()
classifier = classifier.fit(x, y)
# TODO: check how to render the decision tree image (e.g. tree.plot_tree).

# Build the candidate rows as a DataFrame carrying the training column names,
# so the feature order is guaranteed to match the fitted model (the original
# passed bare lists, which relies on implicit ordering and triggers a
# feature-names warning in recent scikit-learn).
candidates = pd.DataFrame(
    [[10, 1, 4, 0, 0, 0],
     [10, 0, 4, 0, 0, 0]],
    columns=features,
)
print("predictions ", classifier.predict(candidates))

# To reduce overfitting, a random forest is used: each of the n_estimators
# trees is fit on a bootstrap sample of the data, and the final prediction is
# the majority vote across trees. random_state pins the bootstrap sampling so
# the printed predictions are reproducible (the original output could change
# from run to run).
classifier = RandomForestClassifier(n_estimators=10, random_state=0)
classifier = classifier.fit(x, y)
print("forest predictions", classifier.predict(candidates))