ImbTreeEntropy

Software to build Decision Trees for imbalanced data. To cite this Original Software Publication: https://www.sciencedirect.com/science/article/pii/S2352711021001242

Installation

You can install ImbTreeEntropy from GitHub with:

# install.packages("devtools")  # uncomment if devtools is not installed yet
library(devtools)
install_github("KrzyGajow/ImbTreeEntropy")

Example

This basic example shows how to solve a common problem:

library("ImbTreeEntropy")
library("caret")

data(iris)

# Original dataset, multiclass classification, only numeric attributes
iris

# Misclassifying class 1 as class 0 is penalized 5 times as heavily as the reverse
class_cost_bin <- matrix( c(0,5,1,0), 2, 2, dimnames = list( 0:1, 0:1 ) )

# Setosa is easy to predict; misclassifying Versicolor as Virginica costs 5, while misclassifying Virginica as Versicolor costs 10
class_cost_mult <- matrix( c(0,1,1,1,0,10,1,5,0), 3, 3, dimnames = list( levels(iris$Species), levels(iris$Species) ) )
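
Printing the matrix makes the layout explicit; note that reading rows as true classes and columns as predicted classes is our interpretation of the comments above, not a documented convention:

# Assumed convention: entry [i, j] is the cost of predicting class j when the true class is i
print(class_cost_mult)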

# Assign higher weights to those observations that are hard to predict correctly
obs_weights <- c( rep(1, 50), c( rep(1, 20), 5, rep(1, 6), 5, rep(1, 22) ), c( rep(1, 19), 10, rep(1, 9), 10, rep(1, 3), 10, 10, rep(1 ,15) ) )

# Dataset for binary classification, only numeric attributes
iris_2 <- iris
iris_2$Species <- factor( rep(0:1, each = 75) )

# Dataset for binary classification, with one unordered factor attribute
iris_3 <- iris_2
iris_3$Petal.Length <- factor( iris_3$Petal.Length )

# Dataset for binary classification, with one ordered factor attribute
iris_4 <- iris_2
iris_4$Petal.Length <- factor( iris_4$Petal.Length, ordered = TRUE )


# Simulation 1: Default settings
Tree1 <- ImbTreeEntropy(Y_name = "Species", X_names = colnames(iris)[-ncol(iris)], data = iris, depth = 5, min_obs = 5, 
                        type = "Shannon", entropy_par = 1, cp = 0, n_cores = 1, weights = NULL, cost = NULL, 
                        class_th = "equal", overfit = "leafcut", cf = 0.25)

PrintTree(Tree1)
Tree1_pred <- PredictTree(Tree1, iris)
confusionMatrix( Tree1_pred$Class, iris$Species )

# Simulation 2: Original dataset, adding cost matrix
Tree2 <- ImbTreeEntropy(Y_name = "Species", X_names = colnames(iris)[-ncol(iris)], data = iris, depth = 5, min_obs = 5, 
                        type = "Shannon", entropy_par = 1, cp = 0, n_cores = 1, weights = NULL, cost = class_cost_mult, 
                        class_th = "equal", overfit = "leafcut", cf = 0.25)

PrintTree(Tree2)
Tree2_pred <- PredictTree(Tree2, iris)
confusionMatrix( Tree2_pred$Class, iris$Species )

# Simulation 3: Original dataset, adding observation weights
Tree3 <- ImbTreeEntropy(Y_name = "Species", X_names = colnames(iris)[-ncol(iris)], data = iris, depth = 5, min_obs = 5, 
                        type = "Shannon", entropy_par = 1, cp = 0, n_cores = 1, weights = obs_weights, cost = NULL, 
                        class_th = "equal", overfit = "leafcut", cf = 0.25)

PrintTree(Tree3)
Tree3_pred <- PredictTree(Tree3, iris)
confusionMatrix( Tree3_pred$Class, iris$Species )

# Simulation 4: Original dataset, tuned thresholds
Tree4 <- ImbTreeEntropy(Y_name = "Species", X_names = colnames(iris)[-ncol(iris)], data = iris, depth = 5, min_obs = 5, 
                        type = "Shannon", entropy_par = 1, cp = 0, n_cores = 1, weights = NULL, cost = NULL, 
                        class_th = "tuned", overfit = "none", cf = 0.25)

PrintTree(Tree4)
Tree4_pred <- PredictTree(Tree4, iris)
confusionMatrix( Tree4_pred$Class, iris$Species )
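
As a generic illustration of what tuning a classification threshold can mean for a binary problem (a minimal sketch of the idea, not ImbTreeEntropy's internal algorithm; tune_threshold is a hypothetical helper), one can grid-search the probability cutoff that maximizes balanced accuracy:

# Hypothetical helper: grid-search a probability cutoff maximizing balanced accuracy
tune_threshold <- function(prob, truth01, grid = seq(0.05, 0.95, by = 0.05)) {
  bal_acc <- sapply(grid, function(th) {
    pred <- as.integer(prob > th)
    sens <- mean(pred[truth01 == 1] == 1)  # sensitivity on the positive class
    spec <- mean(pred[truth01 == 0] == 0)  # specificity on the negative class
    (sens + spec) / 2                      # balanced accuracy
  })
  grid[which.max(bal_acc)]
}
set.seed(1)
tune_threshold(prob = runif(150), truth01 = rep(0:1, each = 75))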

# Simulation 5: Original dataset, Renyi entropy with q = 5
Tree5 <- ImbTreeEntropy(Y_name = "Species", X_names = colnames(iris)[-ncol(iris)], data = iris, depth = 5, min_obs = 5, 
                        type = "Renyi", entropy_par = 5, cp = 0, n_cores = 1, weights = NULL, cost = NULL, 
                        class_th = "equal", overfit = "leafcut", cf = 0.25)

PrintTree(Tree5) # compared to Tree1, one more observation is classified correctly
Tree5_pred <- PredictTree(Tree5, iris)
confusionMatrix( Tree5_pred$Class, iris$Species )
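
For reference, the Renyi entropy of a probability vector p is H_q(p) = log(sum(p^q)) / (1 - q), which recovers Shannon entropy as q approaches 1. A minimal sketch of the formula (our illustration, not the package's internal code):

# Renyi entropy of a probability vector; q = 1 falls back to the Shannon limit
renyi_entropy <- function(p, q) {
  p <- p[p > 0]
  if (isTRUE(all.equal(q, 1))) return(-sum(p * log(p)))
  log(sum(p^q)) / (1 - q)
}
renyi_entropy(table(iris$Species) / nrow(iris), q = 5)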

# Simulation 6: Original dataset, two-parameter Sharma-Mittal entropy with q = 3, r = 3
Tree6 <- ImbTreeEntropy(Y_name = "Species", X_names = colnames(iris)[-ncol(iris)], data = iris, depth = 5, min_obs = 5, 
                        type = "Sharma-Mittal", entropy_par = c(3, 3), cp = 0, n_cores = 1, weights = NULL, 
                        cost = NULL, class_th = "equal", overfit = "none", cf = 0.25)

PrintTree(Tree6)
Tree6_pred <- PredictTree(Tree6, iris)
confusionMatrix( Tree6_pred$Class, iris$Species )
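
The two-parameter Sharma-Mittal entropy generalizes both Renyi (as r -> 1) and Tsallis (as r -> q) entropies, using the standard form H_{q,r}(p) = ((sum(p^q))^((1 - r) / (1 - q)) - 1) / (1 - r). A minimal sketch, again our illustration rather than the package's internal code:

# Sharma-Mittal entropy; this sketch assumes q != 1 and r != 1
sharma_mittal_entropy <- function(p, q, r) {
  p <- p[p > 0]
  (sum(p^q)^((1 - r) / (1 - q)) - 1) / (1 - r)
}
sharma_mittal_entropy(table(iris$Species) / nrow(iris), q = 3, r = 3)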

# Simulation 7: Original dataset, pre-pruning based on the cp parameter
Tree7 <- ImbTreeEntropy(Y_name = "Species", X_names = colnames(iris)[-ncol(iris)], data = iris, depth = 5, min_obs = 5, 
                        type = "Shannon", entropy_par = 1, cp = 0.45, n_cores = 1, weights = NULL, 
                        cost = NULL, class_th = "equal", overfit = "none", cf = 0.25)

PrintTree(Tree7)
Tree7_pred <- PredictTree(Tree7, iris)
confusionMatrix( Tree7_pred$Class, iris$Species )

# Simulation 8: Original dataset, post-pruning based on the cf parameter
Tree8 <- ImbTreeEntropy(Y_name = "Species", X_names = colnames(iris)[-ncol(iris)], data = iris, depth = 5, min_obs = 5, 
                        type = "Shannon", entropy_par = 1, cp = 0, n_cores = 1, weights = NULL, 
                        cost = NULL, class_th = "equal", overfit = "prune", cf = 0.25)

PrintTree(Tree8)
Tree8_pred <- PredictTree(Tree8, iris)
confusionMatrix( Tree8_pred$Class, iris$Species )

# Simulation 9: Original dataset, Shannon entropy weighted by cost matrix, tuned thresholds
Tree9 <- ImbTreeEntropy(Y_name = "Species", X_names = colnames(iris)[-ncol(iris)], data = iris, depth = 5, min_obs = 5, 
                        type = "Shannon", entropy_par = 1, cp = 0, n_cores = 1, weights = NULL, 
                        cost = class_cost_mult, class_th = "tuned", overfit = "leafcut", cf = 0.25)

PrintTree(Tree9)
Tree9_pred <- PredictTree(Tree9, iris)
confusionMatrix( Tree9_pred$Class, iris$Species )

# Simulation 10: Binary classification, Tsallis, only numeric attributes, parallel processing
Tree10 <- ImbTreeEntropy(Y_name = "Species", X_names = colnames(iris_2)[-ncol(iris_2)], data = iris_2, depth = 5, 
                         min_obs = 5, type = "Tsallis", entropy_par = 0.5, cp = 0, n_cores = 10, weights = NULL, 
                         cost = NULL, class_th = "equal", overfit = "leafcut", cf = 0.25) 

PrintTree(Tree10)
Tree10_pred <- PredictTree(Tree10, iris_2)
confusionMatrix( Tree10_pred$Class, iris_2$Species, positive = "1" )
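
Tsallis entropy is the r = q case of Sharma-Mittal: S_q(p) = (1 - sum(p^q)) / (q - 1), again recovering Shannon entropy as q -> 1. A minimal sketch (our illustration, not the package's code):

# Tsallis entropy; q = 1 falls back to the Shannon limit
tsallis_entropy <- function(p, q) {
  p <- p[p > 0]
  if (isTRUE(all.equal(q, 1))) return(-sum(p * log(p)))
  (1 - sum(p^q)) / (q - 1)
}
tsallis_entropy(table(iris_2$Species) / nrow(iris_2), q = 0.5)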

# Simulation 11: Binary classification, Tsallis, with one unordered factor attribute, theoretical thresholds, adding cost matrix, parallel processing
Tree11 <- ImbTreeEntropy(Y_name = "Species", X_names = colnames(iris_3)[-ncol(iris_3)], data = iris_3, depth = 5, 
                         min_obs = 5, type = "Tsallis", entropy_par = 0.5, cp = 0, n_cores = 10, weights = NULL, 
                         cost = class_cost_bin, class_th = "theoretical", overfit = "leafcut", cf = 0.25) 

PrintTree(Tree11)
Tree11_pred <- PredictTree(Tree11, iris_3)
confusionMatrix( Tree11_pred$Class, iris_3$Species, positive = "1" )

# Simulation 12: Interactive learning, original dataset, based on the probability peaks; amb_prob = 1 means the whole
# tree is built on expert decisions, top 4 splits at the attribute level, default pruning based on the leaf cut

# Choosing sequence: 4, 3, 2, 1, 1
Tree12 <- ImbTreeEntropyInter(Y_name = "Species", X_names = colnames(iris)[-ncol(iris)], data = iris, depth = 5, 
                              min_obs = 5, type = "Shannon", entropy_par = 1, cp = 0, n_cores = 1, weights = NULL, 
                              cost = NULL, class_th = "equal", overfit = "leafcut", cf = 0.25, 
                              amb_prob = 1, top_split = 4, var_lev = TRUE, amb_class = NULL, amb_class_freq = NULL )

PrintTreeInter(Tree12)
Tree12_pred <- PredictTree(Tree12, iris)
confusionMatrix( Tree12_pred$Class, iris$Species )

# Simulation 13: Interactive learning, original dataset, based on the probability peaks,
# top 4 splits at the attribute level, pruning based on the cp parameter

# Choosing sequence: 4, 3, 3, 2
Tree13 <- ImbTreeEntropyInter(Y_name = "Species", X_names = colnames(iris)[-ncol(iris)], data = iris, depth = 5, 
                              min_obs = 5, type = "Shannon", entropy_par = 1, cp = 0.1, n_cores = 1, weights = NULL, 
                              cost = NULL, class_th = "equal", overfit = "leafcut", cf = 0.25, 
                              amb_prob = 1, top_split = 4, var_lev = TRUE, amb_class = NULL, amb_class_freq = NULL )

PrintTreeInter(Tree13)
Tree13_pred <- PredictTree(Tree13, iris)
confusionMatrix( Tree13_pred$Class, iris$Species )

# Simulation 14: Interactive learning, original dataset, based on the class frequencies per node (a frequency of 0 means
# the whole tree is built on expert decisions), top 4 splits for each split of the attribute, default pruning based on
# the leaf cut, desired classes (versicolor, virginica) with frequencies (0.5, 0.1)

# Choosing sequence: 3, 2, 4
Tree14 <- ImbTreeEntropyInter(Y_name = "Species", X_names = colnames(iris)[-ncol(iris)], data = iris, depth = 5, 
                              min_obs = 5, type = "Shannon", entropy_par = 1, cp = 0, n_cores = 1, weights = NULL, 
                              cost = NULL, class_th = "equal", overfit = "leafcut", cf = 0.25, amb_prob = 1, top_split = 4, 
                              var_lev = FALSE, amb_class = c("versicolor", "virginica"), amb_class_freq = c(0.5,0.1) )

PrintTreeInter(Tree14)
Tree14_pred <- PredictTree(Tree14, iris)
confusionMatrix( Tree14_pred$Class, iris$Species )

# Simulation 15: Extracting rules
ExtractRules(Tree1)
