From 740357c278c8f751003b0c9a36a06f7428db74e8 Mon Sep 17 00:00:00 2001 From: Srikanth K S Date: Mon, 10 Jun 2024 14:50:14 +0530 Subject: [PATCH] augment and all (#29) --- .DS_Store | Bin 8196 -> 8196 bytes DESCRIPTION | 10 +- NAMESPACE | 25 +- R/augment.R | 466 +++++++++++ R/c5.R | 329 -------- R/cubist.R | 321 -------- R/generic.R | 11 - R/globals.R | 34 - R/output_formats.R | 44 -- R/package.R | 65 +- R/party.R | 176 ----- R/rpart.R | 131 ---- R/rule_translators.R | 48 -- R/ruleclasses.R | 348 --------- R/rulelist.R | 492 ++++++++++++ R/tidy.R | 992 ++++++++++++++++++++++++ R/utils.R | 209 ++++- R/varSpec.R | 100 --- man/.DS_Store | Bin 0 -> 6148 bytes man/addBackquotes.Rd | 1 + man/as_rulelist.Rd | 29 + man/as_rulelist.data.frame.Rd | 48 ++ man/augment.Rd | 24 + man/augment.rulelist.Rd | 176 +++++ man/augment_class_keys.Rd | 13 + man/augment_class_no_keys.Rd | 13 + man/augment_regr_keys.Rd | 13 + man/augment_regr_no_keys.Rd | 13 + man/convert_rule_flavor.Rd | 9 +- man/package_tidyrules.Rd | 18 +- man/positionSpaceOutsideSinglequotes.Rd | 3 +- man/predict.rulelist.Rd | 56 +- man/predict.ruleset.Rd | 51 -- man/predict_all_nokeys_rulelist.Rd | 20 + man/predict_all_rulelist.Rd | 20 + man/predict_core.Rd | 48 -- man/predict_nokeys_rulelist.Rd | 20 + man/predict_rulelist.Rd | 20 + man/print.rulelist.Rd | 22 +- man/print.ruleset.Rd | 19 - man/reexports.Rd | 16 - man/removeEmptyLines.Rd | 1 + man/rulelist.Rd | 90 +++ man/set_keys.Rd | 49 ++ man/strHead.Rd | 1 + man/strReplaceReduce.Rd | 1 + man/strSplitSingle.Rd | 1 + man/strTail.Rd | 1 + man/tidy.C5.0.Rd | 43 +- man/tidy.Rd | 29 + man/tidy.constparty.Rd | 53 +- man/tidy.cubist.Rd | 42 +- man/tidy.rpart.Rd | 51 +- man/to_sql_case.Rd | 40 + man/varSpec.Rd | 7 +- tests/testthat/test-c5.R | 2 +- tests/testthat/test-cubist.R | 12 +- tests/testthat/test-party.R | 6 +- tests/testthat/test-rpart.R | 10 +- tests/testthat/test-ruleset.R | 27 - 60 files changed, 3083 insertions(+), 1836 deletions(-) create mode 100644 R/augment.R delete mode 100644 R/c5.R delete mode 100644 R/cubist.R delete mode 100644 R/generic.R delete mode 100644 R/globals.R delete mode 100644 R/output_formats.R delete mode 100644 R/party.R delete mode 100644 R/rpart.R delete mode 100644 R/rule_translators.R delete mode 100644 R/ruleclasses.R create mode 100644 R/rulelist.R create mode 100644 R/tidy.R delete mode 100644 R/varSpec.R create mode 100644 man/.DS_Store create mode 100644 man/as_rulelist.Rd create mode 100644 man/as_rulelist.data.frame.Rd create mode 100644 man/augment.Rd create mode 100644 man/augment.rulelist.Rd create mode 100644 man/augment_class_keys.Rd create mode 100644 man/augment_class_no_keys.Rd create mode 100644 man/augment_regr_keys.Rd create mode 100644 man/augment_regr_no_keys.Rd delete mode 100644 man/predict.ruleset.Rd create mode 100644 man/predict_all_nokeys_rulelist.Rd create mode 100644 man/predict_all_rulelist.Rd delete mode 100644 man/predict_core.Rd create mode 100644 man/predict_nokeys_rulelist.Rd create mode 100644 man/predict_rulelist.Rd delete mode 100644 man/print.ruleset.Rd delete mode 100644 man/reexports.Rd create mode 100644 man/rulelist.Rd create mode 100644 man/set_keys.Rd create mode 100644 man/tidy.Rd create mode 100644 man/to_sql_case.Rd delete mode 100644 tests/testthat/test-ruleset.R diff --git a/.DS_Store b/.DS_Store index 3aa9b2a65a2831c0832e1a1a25dc8dc1d085a0d2..99063433d20788fa3495cc6188c6f8eff9d6fbc9 100644 GIT binary patch delta 320 zcmZp1XmOa}¥U^hRb!ekzSCf%g+;)0}{{3Hejh8;--Ihn;J1_sv{nV4Bv+1NQa zIJh}tgER8WgG&-iN{gK)dkA{_aB_0S3rJK~n;4krD3}@;*Xk%#TN)YYD3}f?`aMH5}82JRTYXk#B-x_WJ diff --git a/DESCRIPTION b/DESCRIPTION index be6bb6e..4f1daf0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: tidyrules Type: Package Title: Obtain Rules from Rule Based Models as Tidy Dataframe -Version: 0.2.3 +Version: 0.2.4 Authors@R: c( person("Srikanth", "Komala Sheshachala", email = "sri.teach@gmail.com", role = c("aut", "cre")), person("Amith Kumar", "Ullur Raghavendra", email = "amith54@gmail.com", role = c("aut")) @@ -18,8 +18,10 @@ Imports: checkmate (>= 2.3.1), tidytable (>= 0.11.0), data.table (>= 1.14.6), - DescTools, - MetricsWeighted + DescTools (>= 0.99.54), + MetricsWeighted (>= 1.0.3), + cli (>= 3.6.2), + glue (>= 1.7.0), Suggests: AmesHousing (>= 0.0.3), dplyr (>= 0.8), @@ -42,3 +44,5 @@ Encoding: UTF-8 LazyData: true RoxygenNote: 7.3.1 VignetteBuilder: knitr +Roxygen: list(markdown = TRUE) + diff --git a/NAMESPACE b/NAMESPACE index 7f71eea..fb12416 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,19 +1,40 @@ # Generated by roxygen2: do not edit by hand +S3method(as_rulelist,data.frame) +S3method(augment,rulelist) S3method(predict,rulelist) -S3method(predict,ruleset) S3method(print,rulelist) -S3method(print,ruleset) S3method(tidy,C5.0) S3method(tidy,constparty) S3method(tidy,cubist) S3method(tidy,rpart) +export(as_rulelist) +export(augment) export(convert_rule_flavor) +export(set_keys) export(tidy) +export(to_sql_case) importFrom(data.table,":=") +importFrom(generics,augment) importFrom(generics,tidy) importFrom(magrittr,"%>%") importFrom(rlang,"%||%") importFrom(stats,IQR) +importFrom(stats,predict) importFrom(stats,weighted.mean) +importFrom(tidytable,all_of) +importFrom(tidytable,arrange) +importFrom(tidytable,distinct) +importFrom(tidytable,drop_na) +importFrom(tidytable,inner_join) +importFrom(tidytable,left_join) +importFrom(tidytable,mutate) +importFrom(tidytable,n) +importFrom(tidytable,nest) +importFrom(tidytable,relocate) +importFrom(tidytable,right_join) +importFrom(tidytable,row_number) +importFrom(tidytable,select) +importFrom(tidytable,summarise) +importFrom(tidytable,unnest) importFrom(utils,data) diff --git a/R/augment.R b/R/augment.R new file mode 100644 index 0000000..81c5d1d --- /dev/null +++ b/R/augment.R @@ -0,0 +1,466 @@ +################################################################################ +# This is the part of the 'tidyrules' R package hosted at +# https://github.com/talegari/tidyrules with GPL-3 license. +################################################################################ + +#' @keywords internal +#' @name augment_class_no_keys +#' @title as the name says +#' @description as the name says +#' not to be exported +augment_class_no_keys = function(x, new_data, y_name, weight = 1L, ...){ + + # raw predictions + pred_df = + x %>% + select(rule_nbr, LHS) %>% + predict(new_data, multiple = TRUE) %>% + unnest(rule_nbr) %>% + select(row_nbr, rule_nbr) + + # new_data with rule_nbr and 'keys' + new_data_with_rule_nbr = + new_data %>% + mutate(row_nbr = row_number()) %>% + mutate(weight__ = local(weight)) %>% + left_join(pred_df, by = "row_nbr") %>% + left_join(select(x, rule_nbr, RHS), by = "rule_nbr") + + prevalence_df = + new_data_with_rule_nbr %>% + summarise(prevalence_0 = sum(weight__, na.rm = TRUE), + .by = eval(y_name) + ) %>% + drop_na(prevalence_0) %>% + mutate(prevalence = prevalence_0 / sum(prevalence_0)) %>% + select(all_of(c(eval(y_name), "prevalence"))) + + aggregatees_df = + new_data_with_rule_nbr %>% + # bring 'prevalence' column + left_join(prevalence_df,by = eval(y_name)) %>% + summarise( + support = sum(weight__, na.rm = TRUE), + confidence = weighted.mean(ifelse(is.na(eval(y_name) == RHS), FALSE, TRUE), + weight__, + na.rm = TRUE + ), + lift = weighted.mean(ifelse(is.na(eval(y_name) == RHS), FALSE, TRUE), + weight__, + na.rm = TRUE + ) / prevalence[1], + .by = rule_nbr + ) %>% + nest(.by = rule_nbr, .key = "augmented_stats") + + # output has all columns of 'tidy' along with 'augment_stats' + res = + x %>% + left_join(aggregatees_df, by = c("rule_nbr")) %>% + arrange(rule_nbr) + + return(res) +} + +#' @keywords internal +#' @name augment_class_keys +#' @title as the name says +#' @description as the name says +#' not to be exported +augment_class_keys = function(x, new_data, y_name, weight = 1L, ...){ + + keys = attr(x, "keys") + + # raw predictions + # columns: row_nbr, rule_nbr, `keys` + pred_df = + x %>% + select(all_of(c("rule_nbr", "LHS", keys))) %>% + predict(new_data, multi = TRUE) %>% + unnest(rule_nbr) # columns: row_nbr, rule_nbr, `keys` + + # new_data with rule_nbr and 'keys' + # columns: row_nbr, rule_nbr, `keys`, RHS, columns of new_data + new_data_with_rule_nbr = + # new_data with row_nbr and weight__ columns + new_data %>% + mutate(row_nbr = row_number()) %>% + mutate(weight__ = weight) %>% + # bring rule_nbr, `keys` (multiple rows per row_nbr might get created) + inner_join(pred_df, by = "row_nbr") %>% + # bring RHS column from tidy object + inner_join(select(x, all_of(c("rule_nbr", keys, "RHS"))), + by = c(keys, "rule_nbr") + ) + + # prevalence per 'keys' + prevalence_df = + new_data_with_rule_nbr %>% + summarise(prevalence_0 = sum(weight__, na.rm = TRUE), + .by = c(keys, eval(y_name)) + ) %>% + drop_na(prevalence_0) %>% + mutate(prevalence = prevalence_0 / sum(prevalence_0, na.rm = TRUE), + .by = c(keys) + ) %>% + select(all_of(c(keys, eval(y_name), "prevalence"))) + + # add aggregates at rule_nbr and 'keys' level + aggregatees_df = + new_data_with_rule_nbr %>% + left_join(prevalence_df, by = c(keys, eval(y_name))) %>% + summarise( + support = sum(weight__, na.rm = TRUE), + confidence = weighted.mean(ifelse(is.na(eval(y_name) == RHS), FALSE, TRUE), + weight__, + na.rm = TRUE + ), + lift = weighted.mean(ifelse(is.na(eval(y_name) == RHS), FALSE, TRUE), + weight__, + na.rm = TRUE + ) / prevalence[1], + ..., + .by = c(keys, "rule_nbr") + ) %>% + nest(.by = c("rule_nbr", keys), .key = "augmented_stats") + + # output has all columns of 'tidy' along with 'augment_stats' + res = + x %>% + left_join(aggregatees_df, by = c("rule_nbr", keys)) %>% + arrange(!!!rlang::syms(c(keys, "rule_nbr"))) %>% + relocate(all_of(c("rule_nbr", keys))) + + return(res) +} + +#' @keywords internal +#' @name augment_regr_no_keys +#' @title as the name says +#' @description as the name says +#' not to be exported +augment_regr_no_keys = function(x, new_data, y_name, weight = 1L, ...){ + + # raw predictions + pred_df = + x %>% + select(rule_nbr, LHS) %>% + predict(new_data, multiple = TRUE) %>% + unnest(rule_nbr) %>% + select(row_nbr, rule_nbr) + + # new_data with rule_nbr and 'keys' + new_data_with_rule_nbr = + new_data %>% + mutate(row_nbr = row_number()) %>% + mutate(weight__ = local(weight)) %>% + left_join(pred_df, by = "row_nbr") %>% + left_join(select(x, rule_nbr, RHS), by = "rule_nbr") + + if (is.character(x$RHS)) { + new_data_with_rule_nbr = + new_data_with_rule_nbr %>% + nest(.by = c("RHS", "row_nbr")) %>% + mutate(RHS = purrr::map2_dbl(RHS, + data, + ~ eval(parse(text = .x), envir = .y) + ) + ) %>% + unnest(data) + } + + aggregatees_df = + new_data_with_rule_nbr %>% + summarise( + support = sum(weight__, na.rm = TRUE), + IQR = DescTools::IQRw(.data[[y_name]], weight__, na.rm = TRUE), + RMSE = MetricsWeighted::rmse(actual = .data[[y_name]], + predicted = RHS, + w = weight__, + na.rm = TRUE + ), + .by = rule_nbr + ) %>% + nest(.by = rule_nbr, .key = "augmented_stats") + + # output has all columns of 'tidy' along with 'augment_stats' + res = + x %>% + left_join(aggregatees_df, by = c("rule_nbr")) %>% + arrange(rule_nbr) + + return(res) +} + +#' @keywords internal +#' @name augment_regr_keys +#' @title as the name says +#' @description as the name says +#' not to be exported +augment_regr_keys = function(x, new_data, y_name, weight = 1L, ...){ + + keys = attr(x, "keys") + + # raw predictions + # columns: row_nbr, rule_nbr, `keys` + pred_df = + x %>% + select(all_of(c("rule_nbr", "LHS", keys))) %>% + predict(new_data, multi = TRUE) %>% + unnest(rule_nbr) # columns: row_nbr, rule_nbr, `keys` + + # new_data with rule_nbr and 'keys' + # columns: row_nbr, rule_nbr, `keys`, RHS, columns of new_data + new_data_with_rule_nbr = + # new_data with row_nbr and weight__ columns + new_data %>% + mutate(row_nbr = row_number()) %>% + mutate(weight__ = weight) %>% + # bring rule_nbr, `keys` (multiple rows per row_nbr might get created) + inner_join(pred_df, by = "row_nbr") %>% + # bring RHS column from tidy object + inner_join(select(x, all_of(c("rule_nbr", keys, "RHS"))), + by = c(keys, "rule_nbr") + ) + + if (is.character(x$RHS)) { + new_data_with_rule_nbr = + new_data_with_rule_nbr %>% + nest(.by = c("RHS", keys, "row_nbr")) %>% + mutate(RHS = purrr::map2_dbl(RHS, + data, + ~ eval(parse(text = .x), envir = .y) + ) + ) %>% + unnest(data) + } + + aggregatees_df = + new_data_with_rule_nbr %>% + summarise( + support = sum(weight__, na.rm = TRUE), + IQR = DescTools::IQRw(.data[[y_name]], weight__, na.rm = TRUE), + RMSE = MetricsWeighted::rmse(actual = .data[[y_name]], + predicted = RHS, + w = weight__, + na.rm = TRUE + ), + .by = c(keys, "rule_nbr") + ) %>% + nest(.by = c("rule_nbr", keys), .key = "augmented_stats") + + # output has all columns of 'tidy' along with 'augment_stats' + res = + x %>% + left_join(aggregatees_df, by = c("rule_nbr", keys)) %>% + arrange(!!!rlang::syms(c(keys, "rule_nbr"))) %>% + relocate(all_of(c("rule_nbr", keys))) + + return(res) +} + +#' @name augment +#' @title `augment` is re-export of [generics::augment] from +#' [tidyrules][package_tidyrules] package +#' @description See [augment.rulelist] +#' @param x A [rulelist] +#' @param ... For methods to use +#' @seealso [rulelist], [tidy], [augment][augment.rulelist], [predict][predict.rulelist] +#' @importFrom generics augment +#' @family Augment +#' @export +generics::augment + +#' @name augment.rulelist +#' @title Augment a [rulelist] +#' @description `augment` outputs a [rulelist] with an additional column named +#' `augmented_stats` based on summary statistics calculated using `new_data`. +#' @param x A [rulelist] +#' @param new_data (dataframe) with column named `y_name` present +#' @param y_name (string) Column name representing the dependent variable +#' @param weight (numeric, default: 1) Positive weight vector with length equal +#' to one or number of rows of 'new_data' +#' @param ... (expressions) To be send to [tidytable::summarise] for custom +#' aggregations. See examples. +#' @returns A [rulelist] with a new dataframe-column named `augmented_stats`. +#' @details The dataframe-column `augmented_stats` will have these columns +#' corresponding to the `estimation_type`: +#' +#' - For `regression`: `support`, `IQR`, `RMSE` +#' - For `classification`: `support`, `confidence`, `lift` +#' +#' All these metrics are computed in a weighted sense. Arg `weight` is 1 by +#' default. +#' +#' @examples +#' # Examples for augment ------------------------------------------------------ +#' library("magrittr") +#' +#' # C5 ---- +#' att = modeldata::attrition +#' set.seed(100) +#' train_index = sample(c(TRUE, FALSE), nrow(att), replace = TRUE) +#' +#' model_c5 = C50::C5.0(Attrition ~., data = att[train_index, ], rules = TRUE) +#' tidy_c5 = tidy(model_c5) +#' tidy_c5 +#' +#' # augment +#' augmented = augment(tidy_c5, new_data = att[!train_index, ], y_name = "Attrition") +#' +#' augmented %>% +#' tidytable::unnest(augmented_stats, names_sep = "__") %>% +#' tidytable::glimpse() +#' +#' # augment with custom aggregator +#' augmented = +#' augment(tidy_c5, +#' new_data = att[!train_index, ], +#' y_name = "Attrition", +#' output_counts = list(table(Attrition)) +#' ) +#' +#' augmented %>% +#' tidytable::unnest(augmented_stats, names_sep = "__") %>% +#' tidytable::glimpse() +#' +#' # rpart ---- +#' set.seed(100) +#' train_index = sample(c(TRUE, FALSE), nrow(iris), replace = TRUE) +#' +#' model_class_rpart = rpart::rpart(Species ~ ., data = iris[train_index, ]) +#' tidy_class_rpart = tidy(model_class_rpart) +#' tidy_class_rpart +#' +#' model_regr_rpart = rpart::rpart(Sepal.Length ~ ., data = iris[train_index, ]) +#' tidy_regr_rpart = tidy(model_regr_rpart) +#' tidy_regr_rpart +#' +#' #' augment (classification case) +#' augmented = +#' augment(tidy_class_rpart, +#' new_data = iris[!train_index, ], +#' y_name = "Species" +#' ) +#' augmented +#' +#' augmented %>% +#' tidytable::unnest(augmented_stats, names_sep = "__") %>% +#' tidytable::glimpse() +#' +#' #' augment (regression case) +#' augmented = +#' augment(tidy_regr_rpart, +#' new_data = iris[!train_index, ], +#' y_name = "Sepal.Length" +#' ) +#' augmented +#' +#' augmented %>% +#' tidytable::unnest(augmented_stats, names_sep = "__") %>% +#' tidytable::glimpse() +#' +#' # party ---- +#' pen = palmerpenguins::penguins +#' set.seed(100) +#' train_index = sample(c(TRUE, FALSE), nrow(pen), replace = TRUE) +#' +#' model_class_party = partykit::ctree(species ~ ., data = pen[train_index, ]) +#' tidy_class_party = tidy(model_class_party) +#' tidy_class_party +#' +#' model_regr_party = partykit::ctree(bill_length_mm ~ ., data = pen[train_index, ]) +#' tidy_regr_party = tidy(model_regr_party) +#' tidy_regr_party +#' +#' #' augment (classification case) +#' augmented = +#' augment(tidy_class_party, +#' new_data = pen[!train_index, ], +#' y_name = "species" +#' ) +#' augmented +#' +#' augmented %>% +#' tidytable::unnest(augmented_stats, names_sep = "__") %>% +#' tidytable::glimpse() +#' +#' #' augment (regression case) +#' augmented = +#' augment(tidy_regr_party, +#' new_data = tidytable::drop_na(pen[!train_index, ], bill_length_mm), +#' y_name = "bill_length_mm" +#' ) +#' augmented +#' +#' augmented %>% +#' tidytable::unnest(augmented_stats, names_sep = "__") %>% +#' tidytable::glimpse() +#' +#' # cubist ---- +#' att = modeldata::attrition +#' set.seed(100) +#' train_index = sample(c(TRUE, FALSE), nrow(att), replace = TRUE) +#' cols_att = setdiff(colnames(att), c("MonthlyIncome", "Attrition")) +#' +#' model_cubist = Cubist::cubist(x = att[train_index, cols_att], +#' y = att[train_index, "MonthlyIncome"] +#' ) +#' +#' tidy_cubist = tidy(model_cubist) +#' tidy_cubist +#' +#' augmented = +#' augment(tidy_cubist, +#' new_data = att[!train_index, ], +#' y_name = "MonthlyIncome" +#' ) +#' augmented +#' +#' augmented %>% +#' tidytable::unnest(augmented_stats, names_sep = "__") %>% +#' tidytable::glimpse() +#' +#' @seealso [rulelist], [tidy], [augment][augment.rulelist], [predict][predict.rulelist] +#' @family Augment +#' @export +augment.rulelist = function(x, new_data, y_name, weight = 1L, ...){ + + checkmate::assert_string(y_name) + checkmate::assert_data_frame(new_data) + checkmate::assert_true(y_name %in% colnames(new_data)) + checkmate::assert_numeric(weight, + lower = 1e-8, + finite = TRUE, + any.missing = FALSE + ) + checkmate::assert_true(length(weight) %in% c(1, nrow(new_data))) + checkmate::assert_false(anyNA(new_data[[y_name]])) + + + estimation_type = attr(x, "estimation_type") + keys = attr(x, "keys") + + if (is.null(keys)) { + if (estimation_type == "classification"){ + res = augment_class_no_keys(x, new_data, y_name, weight, ...) + } else if (estimation_type == "regression") { + res = augment_regr_no_keys(x, new_data, y_name, weight, ...) + } else { + rlang::abort("unknown 'estimation_type'") + } + + } else { + + if (estimation_type == "classification"){ + res = augment_class_keys(x, new_data, y_name, weight, ...) + } else if (estimation_type == "regression") { + res = augment_regr_keys(x, new_data, y_name, weight, ...) + } else { + rlang::abort("unknown 'estimation_type'") + } + } + + attr(res, "data") = new_data + return(res) +} diff --git a/R/c5.R b/R/c5.R deleted file mode 100644 index cb19ef5..0000000 --- a/R/c5.R +++ /dev/null @@ -1,329 +0,0 @@ -################################################################################ -# This is the part of the 'tidyrules' R package hosted at -# https://github.com/talegari/tidyrules with GPL-3 license. -################################################################################ - -#' @name tidy.C5.0 -#' @title Obtain rules as rulelist/tiydtable from a C5.0 model -#' @description Each row corresponds to a rule per trial_nbr -#' @param x C5 model fitted with `rules = TRUE` -#' @param ... Other arguments (See details) -#' @return A rulelist/tidytable where each row corresponds to a rule. -#' The columns are: rule_nbr, trial_nbr, LHS, RHS, support, confidence, lift -#' @details -#' -#' Optional named arguments: -#' -#' \itemize{ -#' -#' \item laplace(flag, default: TRUE) is supported. This computes confidence -#' with laplace correction as documented under 'Rulesets' here: [C5 -#' doc](https://www.rulequest.com/see5-unix.html). -#' } -#' -#' @examples -#' c5_model = C50::C5.0(Attrition ~., data = modeldata::attrition, rules = TRUE) -#' summary(c5_model) -#' tidy(c5_model) -#' @export - -tidy.C5.0 = function(x, ...){ - - #### checks ################################################################# - - arguments = list(...) - arguments[["laplace"]] = arguments[["laplace"]] %||% TRUE - - # for magrittr dot - . = NULL - - if (!x[["rbm"]]){ - rlang::abort("Model should be built using `rules = TRUE` argument.") - } - - # output of the model - output = x[["output"]] - - # get variable specification - var_spec = varSpec(x) - variable_names = var_spec[["variable"]] - col_classes = var_spec[["type"]] - names(col_classes) = variable_names - - # throw error if there is consecutive spaces - # output from the model squishes the spaces - if (any(stringr::str_count(variable_names, " ") > 0)){ - rlang::abort("Variable names should not two or more consecutive spaces.") - } - - #### core logic ############################################################## - # extract rules part - spl = - output %>% - stringr::str_replace_all("\t", "") %>% # remove tab spaces - stringr::str_replace_all("\n ", "") %>% # handle multiline lineitems - strSplitSingle("\n") # split along newlines - - # detect where the rules start - start_rules_position = min(which(stringr::str_detect(spl, "^Rule "))) - - # detect where the rules end - end_rules_position = - stringr::str_detect(spl, "^Evaluation on training data") %>% - which() %>% - magrittr::subtract(1) %>% - min() - - # get the rules part - spl = spl[start_rules_position:end_rules_position] %>% - stringr::str_squish() %>% - removeEmptyLines() - - ## get raw rules by splitting - - # every rule starts with 'Rule' - cuts = which(stringr::str_detect(spl, "^Rule ")) - # end of rule is a line before the start of next rule - cuts2 = which(stringr::str_detect(spl, "^\\-\\> ")) - - # split rules - rules_raw = purrr::map2(cuts, cuts2, function(x, y) spl[x:y]) - - ## function to get a parsable rule from a raw rule - getRules = function(single_raw_rule){ - - # empty list container - rule = list() - - # get stats from first line ---- - first_line = single_raw_rule[1] - - # A typical first line looks like: - #************************************************** - # "Rule 0/1: (521/30, lift 1.1)", ":" - #************************************************** - - index = strSplitSingle(first_line, ":") %>% - magrittr::extract(1) %>% - strSplitSingle("\\s") %>% - magrittr::extract(2) %>% - strSplitSingle("/") - - if (length(index) == 2){ - rule[["rule_number"]] = as.integer(index[2]) - rule[["trial_number"]] = as.integer(index[1]) + 1L - } else { - rule[["rule_number"]] = as.integer(index) - rule[["trial_number"]] = 1L - } - - stats = - strSplitSingle(first_line, ":") %>% - magrittr::extract(2) %>% - strSplitSingle("\\(") %>% - magrittr::extract(2) %>% - strSplitSingle("\\)") %>% - magrittr::extract(1) %>% - strSplitSingle(",") %>% - stringr::str_squish() - - support_confidence = strSplitSingle(stats[1], "/") - if (length(support_confidence) > 1){ - - # extract support - rule[["support"]] = as.integer(support_confidence[1]) - - # compute confidence (not extract) - if (arguments[["laplace"]]){ - - # C5 doc computes confidence using laplace correction - # (n-m+1)/(n+2) - # n: number of obs in leaf - # m: number of musclassifications among n - rule[["confidence"]] = - rule[["support"]] %>% - magrittr::subtract(as.integer(support_confidence[2])) %>% - magrittr::add(1) %>% - magrittr::divide_by(rule[["support"]] + 2) %>% - round(4) - - } else { - - # without laplace correction - # simply: (n-m)/n - rule[["confidence"]] = - rule[["support"]] %>% - magrittr::subtract(as.integer(support_confidence[2])) %>% - magrittr::divide_by(rule[["support"]]) %>% - round(4) - } - - } else { - - rule[["support"]] = as.integer(support_confidence) - # see comments for laplace above - if (arguments[["laplace"]]){ - rule[["confidence"]] = (rule[["support"]] + 1)/(rule[["support"]] + 2) - } else{ - rule[["confidence"]] = 1 - } - } - - rule[["lift"]] = - strSplitSingle(stats[2], "\\s") %>% - magrittr::extract(2) %>% - as.numeric() - - # curate a single line item of the rule ---- - line_item_curator = function(line_item){ - - # in unforeseen cases just return the rule string - # let the parsing test catch it - line_item_rule = line_item - - # 'in' separator for a single line item of rule - # ex1: JobInvolvement in [Low-Medium] for ordered factors - # ex2: JobRole in {Laboratory_Technician, Sales_Representative} - if (stringr::str_detect(line_item, "\\sin\\s")){ - split_line_item = strSplitSingle(line_item, "\\sin\\s") - lhs_line_item = split_line_item[1] - rhs_line_item = split_line_item[2] - - # unordered factor case - if (stringr::str_detect(line_item, "\\{")){ - rhs_line_item = - rhs_line_item %>% - strHead(-1) %>% # remove quotes - strTail(-1) %>% - strSplitSingle(",") %>% # split the list by comma - stringr::str_trim() %>% # trim if any - # add quotes around levels - purrr::map_chr(function(x) stringr::str_c("'", x, "'")) %>% - stringr::str_c(collapse = ", ") %>% # bind with comma - stringr::str_c("c(", ., ")") # create 'c' structure - - line_item_rule = stringr::str_c(lhs_line_item, - " %in% ", - rhs_line_item - ) - } - - # unordered factor case - if (stringr::str_detect(line_item, "\\[")){ - rhs_line_item = - rhs_line_item %>% - strHead(-1) %>% - strTail(-1) - - # more than one hyphen means some factor level has hyphen - if (stringr::str_count(rhs_line_item, "-") > 1){ - rlang::abort("factor levels cannot have '-'.") - } - - rhs_line_item = rhs_line_item %>% - strSplitSingle("-") %>% - stringr::str_squish() # in case there is space - - # get the levels of the variable - levels = - var_spec[var_spec[["variable"]] == lhs_line_item, ] %>% - as.list() %>% - magrittr::extract2("levels") %>% - magrittr::extract2(1) - - # get all levels between start and end level - start_level = which(levels == rhs_line_item[1]) - end_level = which(levels == rhs_line_item[2]) - - # construct RHS of the line item - rhs_line_item = - levels[start_level:end_level] %>% - stringr::str_c("'", ., "'") %>% - stringr::str_c(collapse = ", ") %>% - stringr::str_c("c(", ., ")") - - # complete line rule - line_item_rule = stringr::str_c(lhs_line_item, - " %in% ", - rhs_line_item - ) - } - } - - # handle '=' case - # ex: MaritalStatus = Single - contains_equals = stringr::str_detect(line_item, " = ") - if (contains_equals){ - - sub_rule = - strSplitSingle(line_item, "=") %>% - stringr::str_trim() - - the_class = col_classes[[ sub_rule[1] ]] - - # quote if non-numeric - if (!(the_class %in% c("numeric", "integer"))){ - sub_rule[2] = stringr::str_c("'", sub_rule[2], "'") - } - - line_item_rule = stringr::str_c(sub_rule, collapse = " == ") - - } - - line_item_rule = paste0("( ", line_item_rule, " )") - return(line_item_rule) - } - - # create LHS and RHS ---- - rule[["LHS"]] = - single_raw_rule %>% - utils::tail(-1) %>% # remove first stats line - utils::head(-1) %>% # remove the RHS line - purrr::map(line_item_curator) %>% # get clean rule lines - stringr::str_c(collapse = " & ") # concat them with '&' - - rule[["RHS"]] = - single_raw_rule %>% - utils::tail(1) %>% # get the RHS line - stringr::str_squish() %>% # remove multispaces - strSplitSingle("\\s") %>% # split by space - magrittr::extract(3) # extract the RHS name - - # return cleaned rule ---- - return(rule) - } - - # apply rule tidying for each rule and return tibble - res = - purrr::map(rules_raw, getRules) %>% - purrr::transpose() %>% - purrr::simplify_all() %>% - tidytable::as_tidytable() - - #### finalize output ######################################################### - # replace variable names with spaces within backquotes - for (i in 1:length(variable_names)){ - res[["LHS"]] = - stringr::str_replace_all(res[["LHS"]], - variable_names[i], - addBackquotes(variable_names[i]) - ) - } - - #### return ################################################################## - res = - res %>% - tidytable::select(rule_nbr = rule_number, trial_nbr = trial_number, - LHS, RHS, - support, confidence, lift - ) - - class(res) = c("rulelist", class(res)) - - attr(res, "keys") = "trial_nbr" - attr(res, "model_type") = "C5" - attr(res, "estimation_type") = "classification" - - return(res) -} - diff --git a/R/cubist.R b/R/cubist.R deleted file mode 100644 index 30d3528..0000000 --- a/R/cubist.R +++ /dev/null @@ -1,321 +0,0 @@ -################################################################################ -# This is the part of the 'tidyrules' R package hosted at -# https://github.com/talegari/tidyrules with GPL-3 license. -################################################################################ - -#' @name tidy.cubist -#' @title Obtain rules as a ruleset/tidytable from a cubist model -#' @description Each row corresponds to a rule per committee. -#' @param x Cubist model -#' @param ... Other arguments (currently unused) -#' @return A ruleset/tidytable where each row corresponds to a rule. The columns -#' are: rule_nbr, committee, LHS, RHS, support, mean, min, max, error -#' @details When col_classes argument is missing, an educated guess is made -#' about class by parsing the RHS of sub-rule. This might sometimes not lead -#' to a parsable rule. -#' @examples -#' data("attrition", package = "modeldata") -#' cols_att = setdiff(colnames(attrition), c("MonthlyIncome", "Attrition")) -#' -#' cb_att = Cubist::cubist(x = attrition[, cols_att], -#' y = attrition[["MonthlyIncome"]] -#' ) -#' summary(cb_att) -#' tidy(cb_att) -#' @export - -tidy.cubist = function(x, ...){ - - #### core rule extraction #################################################### - # output from the model - output = x[["output"]] - - # get variable specification - var_spec = varSpec(x) - variable_names = var_spec[["variable"]] - col_classes = var_spec[["type"]] - names(col_classes) = variable_names - - # throw error if there is consecutive spaces - # output from the model squishes the spaces - if(any(stringr::str_count(variable_names, " ") > 0)){ - rlang::abort("Variable names should not two or more consecutive spaces.") - } - - variable_names_with_ = - stringr::str_replace_all(variable_names, "\\s", "_") - - # split by newline and remove emptylines - lev_1 = - x[["output"]] %>% - strSplitSingle("\\n") %>% - removeEmptyLines() - - # remove everything from 'Evaluation on training data' onwards - evalLine = stringr::str_which(lev_1, "^Evaluation on training data") - lev_2 = - lev_1[-(evalLine:length(lev_1))] %>% - stringr::str_subset("^(?!Model).*$") - - # detect starts and ends of rules - rule_starts = stringr::str_which(stringr::str_trim(lev_2), "^Rule\\s") - # end of a rule is a line before the next rule start - rule_ends = c(utils::tail(rule_starts, -1) - 1, length(lev_2)) - - # create a rule list for cubist - get_rules_cubist = function(single_raw_rule){ - - # a raw rule looks like this: - # - # [1] " Rule 7: [87 cases, mean 15824.0, range 12061 to 17924, est err 694.4]" - # [2] " if" - # [3] "\tJobLevel <= 4" - # [4] "\tJobRole in {Manager, Research_Director}" - # [5] "\tTotalWorkingYears > 14" - # [6] " then" - # [7] "\toutcome = 4166.4 + 3467 JobLevel - 23 Age - 0.011 MonthlyRate" - - # example with equal sign inside - # Rule 1/14: [35 cases, mean 5.364152, range 4.963788 to 5.521399, est err 0.039525] - # - # if - # Year_Built > 1952 - # Bsmt_Exposure in {Av, Mn, No, No_Basement} - # Gr_Liv_Area <= 1692 - # Kitchen_Qual = Excellent - # then - # outcome = 2.533961 + 0.000252 Gr_Liv_Area + 0.0025 Year_Built - # + 0.002 Year_Remod_Add + 0.000105 Garage_Area - # - 0.00054 Lot_Frontage - 3.8e-05 Bsmt_Unf_SF - # + 2.4e-05 Total_Bsmt_SF + 7e-07 Lot_Area - 0.005 Bedroom_AbvGr - # + 0.003 Garage_Cars + 0.003 Fireplaces + 0.07 Longitude - # + 0.001 TotRms_AbvGrd - - res = list() - - # locate the position of square bracket and collect stats - firstLine = stringr::str_squish(single_raw_rule[1]) - openingSquareBracketPosition = stringr::str_locate(firstLine, "\\[")[1, 1] - - # All stats are at the begining of the rule - stat = - # between square brackets - stringr::str_sub(firstLine - , openingSquareBracketPosition + 1 - , stringr::str_length(firstLine) - 1 # closing ] bracket - ) %>% - strSplitSingle("\\,") %>% - stringr::str_trim() - - res[["support"]] = stat[1] %>% - strSplitSingle("\\s") %>% - magrittr::extract(1) %>% - as.integer() - - res[["mean"]] = stat[2] %>% - strSplitSingle(" ") %>% - magrittr::extract(2) %>% - as.numeric() - - res[["min"]] = stat[3] %>% - strSplitSingle(" ") %>% - magrittr::extract(2) %>% - as.numeric() - - res[["max"]] = stat[3] %>% - strSplitSingle(" ") %>% - magrittr::extract(4) %>% - as.numeric() - - res[["error"]] = stat[4] %>% - strSplitSingle(" ") %>% - magrittr::extract(3) %>% - as.numeric() - - # is if-then missing (only outcome is there) - if_exists = any(stringr::str_trim(single_raw_rule) == "if") - - if (if_exists){ - # get LHS - btw_if_then = - seq(which(stringr::str_trim(single_raw_rule) == "if") + 1, - which(stringr::str_trim(single_raw_rule) == "then") - 1 - ) - - # unclean LHS strings, one condition per string - lhsStrings = - single_raw_rule[btw_if_then] %>% - stringr::str_replace_all("\\t", "\\\\n") %>% - stringr::str_trim() %>% - stringr::str_c(collapse = " ") %>% - strSplitSingle("\\\\n") %>% - removeEmptyLines() %>% - stringr::str_trim() - - # function to get the one clean rule string - getRuleString = function(string){ - - # to avoid CRAN notes - . = NULL - - # if there is ' in {' in the string - if(stringr::str_detect(string, "\\sin\\s\\{")){ - - # split with ' in {' - var_lvls = strSplitSingle(string, "\\sin\\s\\{") - - # get the contents inside curly braces - lvls = - var_lvls[2] %>% - # omit the closing curly bracket - strHead(-1) %>% - strSplitSingle(",") %>% - stringr::str_trim() %>% - purrr::map_chr(function(x) stringr::str_c("'", x, "'")) %>% - stringr::str_c(collapse = ", ") %>% # note the space next to comma - stringr::str_c("c(", ., ")") - - # get the variable - var = stringr::str_trim(var_lvls[1]) - rs = stringr::str_c(var, " %in% ", lvls) - - } else { - - # handle '=' case - contains_equals = stringr::str_detect(string, " = ") - - if (contains_equals){ - - sub_rule = strSplitSingle(string, "=") %>% - stringr::str_trim() - - if(!(col_classes[sub_rule[1]] == "numeric")){ - sub_rule[2] = stringr::str_c("'", sub_rule[2], "'") - } - - rs = stringr::str_c(sub_rule, collapse = " == ") - } else { - # nothing to do - rs = string - } - - } # end of handle '=' case - - return(rs) - - } - - # clean up LHS as string - res[["LHS"]] = - purrr::map_chr(lhsStrings, getRuleString) %>% - stringr::str_c("( ", ., " )") %>% - stringr::str_c(collapse = " & ") # note spaces next to AND - } else { - - res[["LHS"]] = NA - - } - - # get RHS - # then might not exist: still retaining old name 'afterThen' - if (if_exists){ - afterThen = seq(which(stringr::str_trim(single_raw_rule) == "then") + 1, - length(single_raw_rule) - ) - } else { - afterThen = seq( - which(stringr::str_detect(stringr::str_trim(single_raw_rule), - "^outcome" - ) - ), - length(single_raw_rule) - ) - } - - # handle brackets around signs - res[["RHS"]] = - single_raw_rule[afterThen] %>% - stringr::str_replace_all("\\t", "") %>% - stringr::str_trim() %>% - stringr::str_c(collapse = " ") %>% - stringr::str_squish() %>% - stringr::str_replace("outcome = ", "") %>% - # remove spaces around +- signs - stringr::str_replace_all("\\s\\+\\s", "++") %>% - stringr::str_replace_all("\\s\\-\\s", "--") %>% - strReplaceReduce(variable_names, variable_names_with_) %>% - stringr::str_replace_all("\\s", " * ") %>% - stringr::str_replace_all("\\+\\+", ") + (") %>% - stringr::str_replace_all("\\-\\-", ") - (") - - # quotes aroud each addenum - res[["RHS"]] = - stringr::str_c("(", res[["RHS"]], ")") %>% - # honour negative intercept - stringr::str_replace("\\(\\)\\s\\-\\s\\(", "(-") - - return(res) - } - - # see if rules have commitees and create commitees vector - rule_number_splits = - stringr::str_split(stringr::str_trim(lev_2)[rule_starts], ":") %>% - purrr::map_chr(function(x) x[[1]]) %>% - stringr::str_split("\\s") %>% - purrr::map_chr(function(x) x[[2]]) %>% - stringr::str_split("/") %>% - simplify2array() %>% - as.integer() - - if (length(rule_number_splits) > length(rule_starts)){ - committees = - rule_number_splits[seq(1, by = 2, length.out = length(rule_starts))] - } else { - committees = rep(1L, length(rule_starts)) - } - - # create parsable rules from raw rules - res = - purrr::map(1:length(rule_starts), - function(i) lev_2[rule_starts[i]:rule_ends[i]] - ) %>% - purrr::map(get_rules_cubist) %>% - purrr::transpose() %>% - purrr::map(unlist) %>% - tidytable::as_tidytable() - - #### prepare and return ###################################################### - # replace variable names with spaces within backquotes - for (i in 1:length(variable_names)){ - res[["LHS"]] = - stringr::str_replace_all(res[["LHS"]], - variable_names[i], - addBackquotes(variable_names[i]) - ) - - res[["RHS"]] = stringr::str_replace_all(res[["RHS"]], - variable_names_with_[i], - addBackquotes(variable_names[i]) - ) - } - - res = - res %>% - tidytable::mutate(committee = local(committees)) %>% - tidytable::arrange(desc(support), .by = committee) %>% - tidytable::mutate(rule_nbr = tidytable::row_number(), .by = committee) - - res = res[, c("rule_nbr", "committee", - "LHS", "RHS", - "support", "mean", "min", "max", "error" - ) - ] - - class(res) = c("ruleset", class(res)) - - attr(res, "keys") = "committee" - attr(res, "model_type") = "cubist" - attr(res, "estimation_type") = "regression" - - return(res) -} diff --git a/R/generic.R b/R/generic.R deleted file mode 100644 index 08b6add..0000000 --- a/R/generic.R +++ /dev/null @@ -1,11 +0,0 @@ -################################################################################ -# This is the part of the 'tidyrules' R package hosted at -# https://github.com/talegari/tidyrules with GPL-3 license. -################################################################################ - -# dev: generic 'tidy' is now imported from 'generics' package -# 'tidyRules' generic is no longer supported. - -#' @importFrom generics tidy -#' @export -generics::tidy diff --git a/R/globals.R b/R/globals.R deleted file mode 100644 index 46a2da6..0000000 --- a/R/globals.R +++ /dev/null @@ -1,34 +0,0 @@ -################################################################################ -# This is the part of the 'tidyrules' R package hosted at -# https://github.com/talegari/tidyrules with GPL-3 license. -################################################################################ - -utils::globalVariables(c(".", - "LHS", - "RHS", - "committee", - "desc", - "dev", - "lift", - "n", - "predict_class", - "rule_nbr", - "rule_number", - "support", - "trial_number", - "yval", - "confidence", - "rn__", - "row_nbr", - "pref__", - "data", - "weight", - "response", - "terminal_node_id", - "sum_weight", - "prevalence", - "winning_response", - "average", - "RMSE" - ) - ) \ No newline at end of file diff --git a/R/output_formats.R b/R/output_formats.R deleted file mode 100644 index 1dd4b2c..0000000 --- a/R/output_formats.R +++ /dev/null @@ -1,44 +0,0 @@ -################################################################################ -# This is the part of the 'tidyrules' R package hosted at -# https://github.com/talegari/tidyrules with GPL-3 license. -################################################################################ - -################################################################################ -# description of the cubist return object -################################################################################ -# data: rowwise concatenated data -# names: new line separated variables, type and levels -# caseWeights: flag -# model: has rules -# output: has rules which can be used for printing with writeLines -# control: list of control values -# committees: Number of committees -# maxd: some related to seed? -# dims: dimensions of the data -# splits: splits of a rule as a dataframe -# usage: Variables used in conditions and the model as a dataframe -# call: call -# coefficients: matrix of rule X varibale form, intercept included -# vars: List of all and used variables - -################################################################################ -# description of the C5 return object -################################################################################ -# names: new line separated variables, type and levels -# cost: Cost if provided -# costMatrix: if provided -# caseWeights: flag -# control: list of control values -# trials: named vector with reuested and actual -# rbm: flag whether it is a rule based model -# boostingResults: a parsed version of the boosting table(s) shown in the output (have not checked this) -# size: n integer vector of the tree/rule size (or sizes in the case of boosting) -# dims: dimensions of the data -# call: call -# levels: levels of the outcome factor variable -# output: has rules which can be used for printing with writeLines -# tree : tree (have not checked this) -# predictors: Names of variables -# rules: Rules in a slightly hard to parse format -# terms: object of class terms and formula -# xlevels: levels of factor/ordered variables \ No newline at end of file diff --git a/R/package.R b/R/package.R index 5a699a3..229845f 100644 --- a/R/package.R +++ b/R/package.R @@ -4,14 +4,75 @@ ################################################################################ #' @name package_tidyrules -#' @title About 'tidyrules' package -#' @description Obtain rules as tidy dataframes +#' @title `tidyrules` +#' @description `tidyrules` package provides a framework to work with decision +#' rules stored as a [rulelist] backed by a tidy dataframe. Rules can be +#' extracted from supported models using [tidy], augmented using validation data +#' by [augment][augment.rulelist], manipulated using standard dataframe +#' operations, (modified) rulelists can be used to [predict][predict.rulelist] +#' on unseen (test) data. Utilities include: Create a rulelist +#' manually ([as_rulelist][as_rulelist.data.frame]), Export a rulelist to SQL +#' ([to_sql_case]) and so on. +#' @seealso [rulelist], [tidy], [augment][augment.rulelist], [predict][predict.rulelist] #' @importFrom magrittr %>% #' @importFrom rlang %||% #' @importFrom data.table := #' @importFrom utils data #' @importFrom stats IQR #' @importFrom stats weighted.mean +#' @importFrom tidytable select +#' @importFrom tidytable arrange +#' @importFrom tidytable mutate +#' @importFrom tidytable summarise +#' @importFrom tidytable distinct +#' @importFrom tidytable all_of +#' @importFrom tidytable n +#' @importFrom tidytable left_join +#' @importFrom tidytable right_join +#' @importFrom tidytable inner_join +#' @importFrom tidytable inner_join +#' @importFrom tidytable nest +#' @importFrom tidytable unnest +#' @importFrom tidytable row_number +#' @importFrom tidytable drop_na +#' @importFrom tidytable relocate "_PACKAGE" list.rules.party = getFromNamespace(".list.rules.party", "partykit") + +utils::globalVariables(c(".", + "LHS", + "RHS", + "committee", + "desc", + "dev", + "lift", + "n", + "predict_class", + "rule_nbr", + "rule_number", + "support", + "trial_number", + "yval", + "confidence", + "rn__", + "row_nbr", + "pref__", + "data", + "weight", + "response", + "terminal_node_id", + "sum_weight", + "prevalence", + "winning_response", + "average", + "RMSE", + "weight__", + "prevalence_0", + ".data", + "rn_df", + "trial_nbr", + "error" + ) + ) + diff --git a/R/party.R b/R/party.R deleted file mode 100644 index e3697b5..0000000 --- a/R/party.R +++ /dev/null @@ -1,176 +0,0 @@ -################################################################################ -# This is the part of the 'tidyrules' R package hosted at -# https://github.com/talegari/tidyrules with GPL-3 license. -################################################################################ - -#' @name tidy.constparty -#' @title Obtain rules as a ruleset/tidytable from a party model -#' @description Each row corresponds to a rule. A rule can be copied into -#' `dplyr::filter` to filter the observations corresponding to a rule -#' @param x party model -#' @param ... Other arguments (currently unused) -#' @details These party models are supported: regression (y is numeric), -#' classification (y is factor) -#' @return A tidytable where each row corresponds to a rule. The columns are: -#' rule_nbr, LHS, RHS, support, confidence (for classification only), lift -#' (for classification only) -#' @examples -#' model_party_cl = partykit::ctree(species ~ .,data = palmerpenguins::penguins) -#' model_party_cl -#' tidy(model_party_cl) -#' -#' model_party_re = partykit::ctree(bill_length_mm ~ ., -#' data = palmerpenguins::penguins -#' ) -#' model_party_re -#' tidy(model_party_re) -#' @export - -tidy.constparty = function(x, ...){ - - ##### assertions and prep #################################################### - arguments = list(...) - - # column names from the x: This will be used at the end to handle the - # variables with a space - col_names = - attr(x$terms, which = "term.labels") %>% - stringr::str_remove_all(pattern = "`") - - # throw error if there are consecutive spaces in the column names - if (any(stringr::str_count(col_names, " ") > 0)){ - rlang::abort( - "Variable names should not have two or more consecutive spaces.") - } - - # detect method using 'fitted' - fitted_df = tidytable::as_tidytable(x$fitted) - colnames(fitted_df) = c("terminal_node_id", "weight", "response") - fitted_df[["terminal_node_id"]] = as.character(fitted_df[["terminal_node_id"]]) - - y_class = class(fitted_df[["response"]]) - if (y_class == "factor") { - type = "classification" - } else if (y_class %in% c("numeric", "integer")) { - type = "regression" - } else { - rlang::inform("tidy supports only classification and regression 'party' models") - rlang::abort("Unsupported party object") - } - - #### core extraction work #################################################### - - # extract rules - raw_rules = list.rules.party(x) - - rules_df = - raw_rules %>% - stringr::str_replace_all(pattern = "\\\"","'") %>% - stringr::str_remove_all(pattern = ", 'NA'") %>% - stringr::str_remove_all(pattern = "'NA',") %>% - stringr::str_remove_all(pattern = "'NA'") %>% - stringr::str_squish() %>% - stringr::str_split(" & ") %>% - purrr::map(~ stringr::str_c("( ", .x, " )")) %>% - purrr::map_chr(~ stringr::str_c(.x, collapse = " & ")) %>% - tidytable::tidytable(LHS = .) %>% - tidytable::mutate(terminal_node_id = names(raw_rules)) - - # create metrics df - if (type == "classification"){ - - terminal_response_df = - fitted_df %>% - tidytable::summarise(sum_weight = sum(weight, na.rm = TRUE), - .by = c(terminal_node_id, response) - ) %>% - tidytable::slice_max(n = 1, - order_by = sum_weight, - by = terminal_node_id, - with_ties = FALSE - ) %>% - tidytable::select(terminal_node_id, - winning_response = response - ) - - prevalence_df = - fitted_df %>% - tidytable::summarise(prevalence = sum(weight, na.rm = TRUE), - .by = response - ) %>% - tidytable::mutate(prevalence = prevalence / sum(prevalence)) %>% - tidytable::select(response, prevalence) - - res = - fitted_df %>% - # bring 'winning_response' column - tidytable::left_join(terminal_response_df, - by = "terminal_node_id" - ) %>% - # bring 'prevalence' column - tidytable::left_join(prevalence_df, - by = c("winning_response" = "response") - ) %>% - tidytable::summarise( - support = sum(weight), - confidence = weighted.mean(response == winning_response, weight, na.rm = TRUE), - lift = weighted.mean(response == winning_response, weight, na.rm = TRUE) / prevalence[1], - RHS = winning_response[1], - .by = terminal_node_id - ) %>% - tidytable::left_join(rules_df, by = "terminal_node_id") %>% - tidytable::arrange(tidytable::desc(confidence)) %>% - tidytable::mutate(., rule_nbr = 1:nrow(.)) %>% - tidytable::select(rule_nbr, LHS, RHS, - support, confidence, lift, - terminal_node_id - ) - - } else if (type == "regression"){ - - res = - fitted_df %>% - tidytable::mutate(average = weighted.mean(response, weight, na.rm = TRUE), - .by = terminal_node_id - ) %>% - tidytable::summarise( - support = sum(weight), - IQR = DescTools::IQRw(response, weight, na.rm = TRUE), - RMSE = MetricsWeighted::rmse(actual = response, - predicted = average, - w = weight, - na.rm = TRUE - ), - average = mean(average), - .by = terminal_node_id - ) %>% - tidytable::left_join(rules_df, by = "terminal_node_id") %>% - tidytable::arrange(tidytable::desc(RMSE)) %>% - tidytable::mutate(., rule_nbr = 1:nrow(.)) %>% - tidytable::select(rule_nbr, LHS, RHS = average, - support, IQR, RMSE, - terminal_node_id - ) - } - - #### finalize output ######################################################### - - # replace variable names with spaces within backquotes - for (i in 1:length(col_names)) { - res[["LHS"]] = - stringr::str_replace_all(res[["LHS"]], - col_names[i], - addBackquotes(col_names[i]) - ) - } - - #### return ################################################################## - - class(res) = c("ruleset", class(res)) - - attr(res, "keys") = NULL - attr(res, "model_type") = "constparty" - attr(res, "estimation_type") = type - - return(res) -} diff --git a/R/rpart.R b/R/rpart.R deleted file mode 100644 index 463323f..0000000 --- a/R/rpart.R +++ /dev/null @@ -1,131 +0,0 @@ -################################################################################ -# This is the part of the 'tidyrules' R package hosted at -# https://github.com/talegari/tidyrules with GPL-3 license. -################################################################################ - -#' @name tidy.rpart -#' @title Obtain rules as a ruleset/tidytable from a rpart model -#' @description Each row corresponds to a rule. A rule can be copied into -#' `dplyr::filter` to filter the observations corresponding to a rule -#' @param x rpart model -#' @param ... Other arguments (currently unused) -#' @details NOTE: For rpart rules, one should build the model without -#' \bold{ordered factor} variable. We recommend you to convert \bold{ordered -#' factor} to \bold{factor} or \bold{integer} class. -#' @return A tidytable where each row corresponds to a rule. The columns are: -#' rule_nbr, LHS, RHS, support, confidence (for classification only), lift -#' (for classification only) -#' @examples -#' rpart_class = rpart::rpart(Species ~ .,data = iris) -#' rpart_class -#' tidy(rpart_class) -#' -#' rpart_regr = rpart::rpart(Sepal.Length ~ .,data = iris) -#' rpart_regr -#' tidy(rpart_regr) -#' @export - -tidy.rpart = function(x, ...){ - - ##### assertions and prep #################################################### - arguments = list(...) - - # supported 'rpart' classes - method_rpart = x$method - # classification: class, regression: anova - checkmate::assert_choice(method_rpart, c("class", "anova")) - - # build with y = TRUE - if (is.null(x$y)){ - rlang::abort("rpart model should be built using argument `y = TRUE`.") - } - - # column names from the x: This will be used at the end to handle the - # variables with a space - col_names = - attr(x$terms, which = "term.labels") %>% - stringr::str_remove_all(pattern = "`") - - # throw error if there are consecutive spaces in the column names - if (any(stringr::str_count(col_names, " ") > 0)){ - rlang::abort( - "Variable names should not have two or more consecutive spaces.") - } - - #### core extraction work #################################################### - - # convert to class "party" - party_obj = partykit::as.party(x) - - # extract rules - rules = - list.rules.party(party_obj) %>% - stringr::str_replace_all(pattern = "\\\"","'") %>% - stringr::str_remove_all(pattern = ", 'NA'") %>% - stringr::str_remove_all(pattern = "'NA',") %>% - stringr::str_remove_all(pattern = "'NA'") %>% - stringr::str_squish() %>% - stringr::str_split(" & ") %>% - purrr::map(~ stringr::str_c("( ", .x, " )")) %>% - purrr::map_chr(~ stringr::str_c(.x, collapse = " & ")) - - terminal_nodes = partykit::nodeids(party_obj, terminal = TRUE) - - # create metrics df - if (method_rpart == "class"){ - prevalence = as.numeric(prop.table(table(x$y))) - - res = - x$frame[terminal_nodes, c("n", "dev", "yval")] %>% - tidytable::mutate(confidence = (n + 1 - dev) / (n + 2)) %>% - tidytable::rename(support = n, predict_class = yval) %>% - tidytable::mutate(RHS = attr(x, "ylevels")[predict_class]) %>% - tidytable::mutate(prevalence = prevalence[predict_class]) %>% - tidytable::mutate(lift = confidence / prevalence) %>% - tidytable::mutate(LHS = rules) - - } else if (method_rpart == "anova"){ - res = - x$frame[terminal_nodes, c("n","yval")] %>% - tidytable::rename(support = n, RHS = yval) %>% - tidytable::mutate(LHS = rules) - } - - #### finalize output ######################################################### - - # replace variable names with spaces within backquotes - for (i in 1:length(col_names)) { - res[["LHS"]] = - stringr::str_replace_all(res[["LHS"]], - col_names[i], - addBackquotes(col_names[i]) - ) - } - - #### return ################################################################## - - res[["rule_nbr"]] = 1:nrow(res) - - if (method_rpart == "class"){ - res = - res %>% - tidytable::select(rule_nbr, LHS, RHS, support, confidence, lift) - - } else if (method_rpart == "anova") { - res = - res %>% - tidytable::select(rule_nbr, LHS, RHS, support) - } - - class(res) = c("ruleset", class(res)) - - attr(res, "keys") = NULL - attr(res, "model_type") = "rpart" - if (method_rpart == "class"){ - attr(res, "estimation_type") = "classification" - } else if (method_rpart == "anova"){ - attr(res, "estimation_type") = "regression" - } - - return(res) -} diff --git a/R/rule_translators.R b/R/rule_translators.R deleted file mode 100644 index e8e0bba..0000000 --- a/R/rule_translators.R +++ /dev/null @@ -1,48 +0,0 @@ -#' @name convert_rule_flavor -#' @title Convert a R parsable rule to python/sql parsable rule -#' @description Convert a R parsable rule to python/sql parsable rule -#' @param rule (chr vector) R parsable rule(s) -#' @param flavor (string) One among: 'python', 'sql' -#' @return (chr vector) of rules -#' @export -convert_rule_flavor = function(rule, flavor){ - - checkmate::assert_character(rule) - checkmate::assert_string(flavor) - flavor = stringr::str_to_lower(flavor) - checkmate::assert_choice(flavor, c("python", "sql")) - - if (flavor == "python"){ - res = - rule %>% - stringr::str_replace_all("\\( ", "") %>% - stringr::str_replace_all(" \\)", "") %>% - - stringr::str_replace_all("%in%", "in") %>% - stringr::str_replace_all("c\\(", "[") %>% - stringr::str_replace_all("\\)", "]") %>% - - stringr::str_replace_all("&", " ) and (") %>% - - stringr::str_c("( ", ., " )") %>% - stringr::str_squish() - - } else if (flavor == "sql"){ - res = - rule %>% - stringr::str_replace_all("\\( ", "") %>% - stringr::str_replace_all(" \\)", "") %>% - - stringr::str_replace_all("%in%", "IN") %>% - stringr::str_replace_all("c\\(", "[") %>% - stringr::str_replace_all("\\)", "]") %>% - - stringr::str_replace_all("&", " ) AND (") %>% - - stringr::str_c("( ", ., " )") %>% - stringr::str_squish() - } - - attr(res, "flavor") = flavor - return(res) -} diff --git a/R/ruleclasses.R b/R/ruleclasses.R deleted file mode 100644 index 9fefc15..0000000 --- a/R/ruleclasses.R +++ /dev/null @@ -1,348 +0,0 @@ -################################################################################ -# This is the part of the 'tidyrules' R package hosted at -# https://github.com/talegari/tidyrules with GPL-3 license. -################################################################################ - -#' @name print.ruleset -#' @title Print method for ruleset class -#' @description Prints 'keys' and ruleset as a tidytable -#' @param x A ruleset object -#' @param ... Passed to `tidytable::print` -#' @return Input (invisibly) -#' @export -print.ruleset = function(x, ...){ - - rlang::inform(paste0("# A ruleset/tidytable with keys: ", - paste(attr(x, "keys"), collapse = ", ") - ) - ) - - class(x) = setdiff(class(x), "ruleset") - print(x, ...) - class(x) = c("ruleset", class(x)) - - return(invisible(x)) -} - -#' @name print.rulelist -#' @title Print method for rulelist class -#' @description Prints 'keys' and rulelist as a tidytable -#' @param x A rulelist object -#' @param ... Passed to `tidytable::print` -#' @return Input (invisibly) -#' @export -print.rulelist = function(x, ...){ - - rlang::inform(paste0("# A rulelist/tidytable with keys: ", - paste(attr(x, "keys"), collapse = ", ") - ) - ) - - class(x) = setdiff(class(x), "rulelist") - print(x, ...) - class(x) = c("rulelist", class(x)) - - return(invisible(x)) -} - -#' @name predict_core -#' @title Core predict for ruleset/list/tidy set of rules -#' @description Core logic of predict method is written in a generic sense. This -#' function will not be exposed at user level. -#' @param rules_df dataframe with at least two columns: `rule_nbr`, `LHS`. -#' Should have 'keys' columns such that `rule_nbr` along with 'keys' columns -#' form a unique combo per row -#' @param new_data Data to predict on -#' @return dataframe with these columns: `row_nbr` (integer), 'keys' columns, -#' `rule_nbr` (list of integers) -#' @details If a row number is not covered under any rule, then it does not -#' appear as a row in the output. -#' @examples -#' \dontrun{ -#' library("magrittr") -#' -#' # ruleset case -#' rpart::rpart(Species ~ .,data = iris) %>% -#' tidy() %>% -#' dplyr::select(rule_nbr, LHS) %>% -#' predict_core(iris) -#' -#' # rulelist case -#' C50::C5.0(species ~., -#' data = palmerpenguins::penguins, -#' trials = 5, -#' rules = TRUE -#' ) %>% -#' tidy() %>% -#' dplyr::select(rule_nbr, trial_nbr, LHS) %>% -#' predict_core(palmerpenguins::penguins) -#' } -predict_core = function(rules_df, new_data){ - - # rules_df should have key column (if any), rule_nbr and LHS - - # keys and 'rule_nbr' should form unique combinations - n_unique_combos = nrow(tidytable::distinct(rules_df, -LHS)) - - if (nrow(rules_df) != n_unique_combos) { - rlang::abort("'rule_nbr' and other keys should be unique combination") - } - - keys = setdiff(colnames(rules_df), c("LHS", "rule_nbr")) - - new_data_with_rn = tidytable::mutate(new_data, rn__ = tidytable::row_number()) - - # function to get row numbers for a given rule and dataset combo - # return: integer vector or NULL - get_rows_per_rule = function(rule_string, dataset){ - - rule_expr = parse(text = rule_string) - - row_numbers = - tidytable::filter(dataset, eval(rule_expr)) %>% - tidytable::pull(rn__) - - return(row_numbers) - } - - # function to get row numbers from a DF(chunk) of rules - # chunk should have two columns: rule_nbr, LHS - # returns a DF with two columns: rule_nbr, row_nbr - get_rows_df = function(chunk){ - - chunk %>% - tidytable::mutate(row_nbr = - purrr::map(LHS, - ~ get_rows_per_rule(.x, new_data_with_rn) - ) - ) %>% - tidytable::select(rule_nbr, row_nbr) - } - - res = - rules_df %>% - magrittr::set_class(setdiff(class(rules_df), c("rulelist", "ruleset"))) %>% - tidytable::nest(.by = keys) %>% - tidytable::mutate(data = purrr::map(data, get_rows_df)) %>% - tidytable::unnest(data) %>% - tidytable::unnest(row_nbr) %>% - tidytable::nest(.by = -rule_nbr) %>% - tidytable::mutate(rule_nbr = purrr::map(data, tidytable::pull)) %>% - tidytable::select(c("row_nbr", keys, "rule_nbr")) - - return(res) -} - -#' @name predict.rulelist -#' @title `predict` method for rulelist class -#' @description Returns the `rule_nbr` applicable for a `row_nbr` in new_data -#' @param object rulelist object -#' @param new_data dataframe to predict -#' @param raw (flag, default: FALSE) Whether raw prediction are to be provided -#' @param ... unused -#' @return A dataframe indicating `rule_nbr` applicable for a `row_nbr` in -#' new_data -#' @details -#' -#' If a `row_nbr` is covered more than one `rule_nbr` per 'keys', then -#' `rule_nbr` appearing in the earlier (as in row order) takes precedence. -#' -#' When raw is `FALSE`(default), output is a tidytable/dataframe with three or -#' more columns: `row_number` (int), columns corresponding to 'keys', `rule_nbr` -#' (int). If a row number is not covered by any rule, then there is one row with -#' all other columns other than `row_nbr` has a missing value. -#' -#' When raw is `TRUE`(default), output is a tidytable/dataframe with three or -#' more columns: `row_number` (int), columns corresponding to 'keys', `rule_nbr` -#' (list of intergers). If a row number is not covered by any rule, then there -#' is no row corresponding the `row_nbr`. -#' -#' @examples -#' model_c5 = C50::C5.0(species ~., -#' data = palmerpenguins::penguins, -#' trials = 5, -#' rules = TRUE -#' ) -#' tidy_c5 = tidy(model_c5) -#' tidy_c5 -#' -#' output_1 = predict(tidy_c5, palmerpenguins::penguins) -#' output_1 # different rules per 'keys' (`trial_nbr` here) -#' -#' output_2 = predict(tidy_c5, palmerpenguins::penguins, raw = TRUE) -#' output_2 # `rule_nbr` is a list-column of integer vectors -#' -#' @export -predict.rulelist = function(object, new_data, raw = FALSE, ...){ - - class(object) = setdiff(class(object), c("rulelist", "ruleset")) - keys = attr(object, "keys") - - if (raw){ - res = - object %>% - tidytable::select(c("rule_nbr", keys, "LHS")) %>% - predict_core(new_data) - - return(res) - } - - # idea - # 1. Get pref order for rules within a key. - # 2. merge with unnested raw predict. - # 3. Per key, choose a row_nbr served by the most prefered rule - # 4. Add empty rows for uncovered row numbers - - if (!is.null(keys)) { - res = - # 1. Get pref order for rules within a key. - object %>% - tidytable::select(c("rule_nbr", keys)) %>% - tidytable::group_by(tidytable::all_of(keys)) %>% - tidytable::mutate(pref__ = tidytable::row_number()) %>% - tidytable::ungroup() %>% - - # 2. merge with unnested raw predict. - tidytable::inner_join( - object %>% - tidytable::select(c("rule_nbr", keys, "LHS")) %>% - predict_core(new_data) %>% - tidytable::unnest(rule_nbr), - - by = c(keys, "rule_nbr") - ) %>% - - # 3. Per key, choose a row_nbr served by the most prefered rule - tidytable::slice_min(n = 1, - order_by = pref__, - by = c("row_nbr", keys) - ) %>% - tidytable::select(c("row_nbr", keys, "rule_nbr")) %>% - - # 4. Add empty rows for uncovered row numbers - tidytable::right_join( - object %>% - tidytable::distinct(keys) %>% - tidytable::cross_join(tidytable::tidytable(row_nbr = 1:nrow(new_data))), - by = c("row_nbr", keys) - ) %>% - tidytable::arrange(row_nbr) - - } else { - - res = - # 1. Get pref order for rules within a key. - object %>% - tidytable::select("rule_nbr") %>% - tidytable::mutate(pref__ = tidytable::row_number()) %>% - - # 2. merge with unnested raw predict. - tidytable::inner_join( - object %>% - tidytable::select(c("rule_nbr", "LHS")) %>% - predict_core(new_data) %>% - tidytable::unnest(rule_nbr), - - by = "rule_nbr" - ) %>% - - # 3. Per key, choose a row_nbr served by the most preferred rule - tidytable::slice_min(n = 1, - order_by = pref__, - by = "row_nbr" - ) %>% - tidytable::select(c("row_nbr", "rule_nbr")) %>% - - # 4. Add empty rows for uncovered row numbers - tidytable::right_join( - tidytable::tidytable(row_nbr = 1:nrow(new_data)), - by = "row_nbr" - ) %>% - tidytable::arrange(row_nbr) - } - - return(res) -} - -#' @name predict.ruleset -#' @title `predict` method for ruleset class -#' @description Returns the `rule_nbr` applicable for a `row_nbr` in new_data -#' @param object ruleset object -#' @param new_data dataframe to predict -#' @param raw (flag, default: FALSE) Whether raw prediction are to be provided -#' @param ... unused -#' @return A dataframe indicating `rule_nbr` applicable for a `row_nbr` in -#' new_data -#' @details -#' -#' A `row_nbr` is covered more than one `rule_nbr` per 'keys', results in error. -#' -#' When raw is `FALSE`(default), output is a tidytable/dataframe with three or -#' more columns: `row_number` (int), columns corresponding to`keys`, `rule_nbr` -#' (int). If a row number is not covered by any rule, then there is one row with -#' all other columns other than `row_nbr` has a missing value. -#' -#' When raw is `TRUE`(default), output is a tidytable/dataframe with three or -#' more columns: `row_number` (int), columns corresponding to`keys`, `rule_nbr` -#' (list of integers). If a row number is not covered by any rule, then there is -#' no row corresponding the `row_nbr`. -#' -#' @examples -#' model_rpart = rpart::rpart(species ~ ., -#' data = palmerpenguins::penguins -#' ) -#' tidy_rpart = tidy(model_rpart) -#' tidy_rpart -#' -#' output_1 = predict(tidy_rpart, palmerpenguins::penguins) -#' output_1 -#' -#' output_2 = predict(tidy_rpart, palmerpenguins::penguins, raw = TRUE) -#' output_2 # `rule_nbr` is a list-column of integer vectors -#' -#' @export -predict.ruleset = function(object, new_data, raw = FALSE, ...){ - - class(object) = setdiff(class(object), c("rulelist", "ruleset")) - keys = attr(object, "keys") - - res = - object %>% - tidytable::select(c("rule_nbr", keys, "LHS")) %>% - predict_core(new_data) - - if (raw) return(res) - - if (any(purrr::map_int(res$rule_nbr, length)) > 1) { - rlang::inform("Some rows are covered by more than one rule.") - rlang::inform("Run predict with `raw = TRUE` to see those cases.") - rlang::abort("A row should be not covered by more than one rule.") - } - - res = - res %>% - tidytable::mutate(rule_nbr = purrr::list_simplify(rule_nbr, strict = TRUE)) - - # Add empty rows for uncovered row numbers - if (!is.null(keys)) { - res = - res %>% - tidytable::right_join( - object %>% - tidytable::distinct(keys) %>% - tidytable::cross_join(tidytable::tidytable(row_nbr = 1:nrow(new_data))), - by = c("row_nbr", keys) - ) %>% - tidytable::arrange(row_nbr) - - } else { - res = - res %>% - tidytable::right_join( - tidytable::tidytable(row_nbr = 1:nrow(new_data)), - by = "row_nbr" - ) %>% - tidytable::arrange(row_nbr) - } - return(res) -} diff --git a/R/rulelist.R b/R/rulelist.R new file mode 100644 index 0000000..e5a14e0 --- /dev/null +++ b/R/rulelist.R @@ -0,0 +1,492 @@ +################################################################################ +# This is the part of the 'tidyrules' R package hosted at +# https://github.com/talegari/tidyrules with GPL-3 license. +################################################################################ + +################################################################################ +#### rulelist documentation +################################################################################ + +#' @name rulelist +#' @title Rulelist +#' @description +#' ## Structure +#' +#' A `rulelist` is ordered list of rules stored as a dataframe. Each row, +#' specifies a rule (LHS), expected outcome (RHS) and some other details. +#' +#' It has these mandatory columns: +#' +#' - `rule_nbr`: (integer vector) Rule number +#' - `LHS`: (character vector) A rule is a string that can be parsed using [base::parse()] +#' - `RHS`: (character vector or a literal) +#' +#' ## Example +#' +#' ``` +#' | rule_nbr|LHS |RHS | support| confidence| lift| +#' |--------:|:--------------------------------------------------------------------|:---------|-------:|----------:|--------:| +#' | 1|( island %in% c('Biscoe') ) & ( flipper_length_mm > 203 ) |Gentoo | 122| 1.0000000| 2.774193| +#' | 2|( island %in% c('Biscoe') ) & ( flipper_length_mm <= 203 ) |Adelie | 46| 0.9565217| 2.164760| +#' | 3|( island %in% c('Dream', 'Torgersen') ) & ( bill_length_mm > 44.1 ) |Chinstrap | 65| 0.9538462| 4.825339| +#' | 4|( island %in% c('Dream', 'Torgersen') ) & ( bill_length_mm <= 44.1 ) |Adelie | 111| 0.9459459| 2.140825| +#' ``` +#' +#' ## Create a rulelist +#' +#' A `rulelist` can be created using [tidy()] on some supported model fits +#' (run: `utils::methods(tidy)`). It can also be created manually from a +#' existing dataframe using [as_rulelist][as_rulelist.data.frame]. +#' +#' ## Keys and attributes +#' +#' Columns identified as 'keys' along with `rule_nbr` form a unique +#' combination +#' -- a group of rules. For example, rule-based C5 model with multiple trials +#' creates rules per each `trial_nbr`. `predict` method understands 'keys', +#' thereby provides/predicts a rule number (for each row in new data / test +#' data) within the same `trial_nbr`. +#' +#' A rulelist has these mandatory attributes: +#' - `estimation_type`: One among `regression`, `classification` +#' +#' A rulelist has these optional attributes: +#' - `keys`: (character vector)Names of the column that forms a key. +#' - `model_type`: (string) Name of the model +#' +#' ## Methods for rulelist +#' +#' 1. [Predict][predict.rulelist]: Given a dataframe (possibly without a +#' dependent variable column aka 'test data'), predicts the first rule (as +#' ordered in the rulelist) per 'keys' that is applicable for each row. When +#' `multiple = TRUE`, returns all rules applicable for a row (per key). +#' +#' 2. [Augment][augment.rulelist]: Given a dataframe (with dependent variable +#' column, aka validation data), creates summary statistics per rule and +#' returns a rulelist with a new dataframe-column. +#' +#' ## Manipulating a rulelist +#' +#' Rulelists are essentially dataframes. Hence, any dataframe operations which +#' preferably preserve attributes will output a rulelist. [as_rulelist] and +#' [as.data.frame] will help in moving back and forth between rulelist and +#' dataframe worlds. +#' +#' ## Utilities for a rulelist +#' +#' 1. [as_rulelist][as_rulelist.data.frame]: Create a `rulelist` from a +#' dataframe with some mandatory columns. 2. [set_keys]: Set or Unset 'keys' +#' of a `rulelist`. 3. [to_sql_case]: Outputs a SQL case statement for a +#' `rulelist`. 4. [convert_rule_flavor]: Converts `R`-parsable rule strings to +#' python/SQL parsable rule strings. +#' +#' @seealso [rulelist], [tidy], [augment][augment.rulelist], +#' [predict][predict.rulelist] +identity # just a placeholder for 'rulelist' documentation, not exported + + +################################################################################ +#### print +################################################################################ + +#' @name print.rulelist +#' @title Print method for [rulelist] class +#' @description Prints [rulelist] attributes and first few rows. +#' @param x A [rulelist] object +#' @param ... Passed to `tidytable::print` +#' @return input [rulelist] (invisibly) +#' @seealso [rulelist], [tidy], [augment][augment.rulelist], [predict][predict.rulelist] +#' @family Core Rulelist Utility +#' @export +print.rulelist = function(x, ...){ + keys = attr(x, "keys") + + cli::cli_rule(left = "Rulelist") + + if (is.null(keys)) { + cli::cli_alert_info("{.emph keys}: {.strong NULL}") + } else { + cli::cli_alert_info("{.emph keys}: {.val {keys}}") + n_combo = nrow(distinct(select(x, all_of(keys)))) + cli::cli_alert_info("{.emph Number of distinct keys}: {.val {n_combo}}") + } + + cli::cli_alert_info("{.emph Number of rules}: {.val {nrow(x)}}") + + model_type = attr(x, 'model_type') + if (is.null(model_type)){ + cli::cli_alert_info("{.emph Model type}: {.strong NULL}") + } else { + cli::cli_alert_info("{.emph Model type}: {.val {model_type}}") + } + + estimation_type = attr(x, 'estimation_type') + if (is.null(estimation_type)){ + cli::cli_alert_info("{.emph estimation type}: {.strong NULL}") + } else { + cli::cli_alert_info("{.emph estimation type}: {.val {estimation_type}}") + } + cli::cli_text("") + + class(x) = setdiff(class(x), "rulelist") + print(x, ...) + class(x) = c("rulelist", class(x)) + + return(invisible(x)) +} + +################################################################################ +#### predict +################################################################################ + +#' @keywords internal +#' @name predict_all_nokeys_rulelist +#' @title as the name says +#' @description as the name says +#' @param rulelist rulelist +#' @param new_data new_data +#' @return dataframe +# Not to be exported +predict_all_nokeys_rulelist = function(rulelist, new_data){ + + # new_data is expected to inherit "data.table" + new_data2 = rlang::duplicate(new_data) + new_data2[["row_nbr"]] = 1:nrow(new_data2) + + out = vector("list", nrow(rulelist)) + for (rn in 1:nrow(rulelist)) { + + mask = eval(parse(text = rulelist$LHS[rn]), new_data2) + mask = ifelse(is.na(mask), FALSE, mask) + out[[rn]] = new_data2$row_nbr[mask] + } + + res = + tidytable::tidytable(rule_nbr = 1:nrow(rulelist), + row_nbr = out + ) %>% + unnest(row_nbr, keep_empty = TRUE) %>% + tidytable::full_join(tidytable::tidytable(row_nbr = 1:nrow(new_data))) + + return(res) +} + +#' @keywords internal +#' @name predict_all_rulelist +#' @title with or without keys +#' @description uses predict_all_nokeys_rulelist +#' @param rulelist rulelist +#' @param new_data new_data +#' @return dataframe +# Not to be exported +predict_all_rulelist = function(rulelist, new_data){ + + new_data = data.table::as.data.table(new_data) + keys = attr(rulelist, "keys", exact = TRUE) + + if (is.null(keys)) { + + res = + predict_all_nokeys_rulelist(rulelist, new_data) %>% + arrange(row_nbr) %>% + select(row_nbr, rule_nbr) %>% + nest(.by = row_nbr, .key = "rule_nbr") %>% + mutate(rule_nbr = purrr::map(rule_nbr, ~ .x[[1]])) + + } else { + + res = + rulelist %>% + as.data.frame() %>% + nest(data = tidytable::everything(), .by = keys) %>% + mutate(rn_df = purrr::map(data, ~ predict_all_nokeys_rulelist(.x, new_data))) %>% + select(-data) %>% + unnest(rn_df) %>% + drop_na(row_nbr) %>% + select(all_of(c("row_nbr", keys, "rule_nbr"))) %>% + arrange(!!!rlang::syms(c("row_nbr", keys, "rule_nbr"))) %>% + nest(.by = c("row_nbr", keys), .key = "rule_nbr") %>% + mutate(rule_nbr = purrr::map(rule_nbr, ~ .x[[1]])) + } + + return(res) +} + +#' @keywords internal +#' @name predict_nokeys_rulelist +#' @title as the name says +#' @description as the name says +#' @param rulelist rulelist +#' @param new_data new_data +#' @return dataframe +# not to be exported +predict_nokeys_rulelist = function(rulelist, new_data){ + + # new_data is expected to inherit "data.table" + new_data2 = rlang::duplicate(new_data) + new_data2[["row_nbr"]] = 1:nrow(new_data2) + + out = vector("list", nrow(rulelist)) + for (rn in 1:nrow(rulelist)) { + + mask = eval(parse(text = rulelist$LHS[rn]), new_data2) + mask = ifelse(is.na(mask), FALSE, mask) + out[[rn]] = new_data2$row_nbr[mask] + + if (sum(mask > 0)) { + new_data2 = new_data2[!mask] + } + + if (nrow(new_data2) == 0) { + break + } + } + + res = + tidytable::tidytable(rule_nbr = 1:nrow(rulelist), + row_nbr = out + ) %>% + unnest(row_nbr, keep_empty = TRUE) %>% + tidytable::full_join(tidytable::tidytable(row_nbr = 1:nrow(new_data))) + + return(res) +} + +#' @keywords internal +#' @name predict_rulelist +#' @title with or without keys +#' @description uses predict_nokeys_rulelist +#' @param rulelist rulelist +#' @param new_data new_data +#' @return dataframe +# Not to be exported +predict_rulelist = function(rulelist, new_data){ + + new_data = data.table::as.data.table(new_data) + keys = attr(rulelist, "keys", exact = TRUE) + + if (is.null(keys)) { + + res = + predict_nokeys_rulelist(rulelist, new_data) %>% + arrange(row_nbr) %>% + select(row_nbr, rule_nbr) + + } else { + + res = + rulelist %>% + as.data.frame() %>% + nest(data = tidytable::everything(), .by = keys) %>% + mutate(rn_df = purrr::map(data, ~ predict_nokeys_rulelist(.x, new_data))) %>% + select(-data) %>% + unnest(rn_df) %>% + drop_na(row_nbr) %>% + select(all_of(c("row_nbr", keys, "rule_nbr"))) %>% + arrange(!!!rlang::syms(c("row_nbr", keys, "rule_nbr"))) + } + + return(res) +} + +#' @name predict.rulelist +#' @title `predict` method for a [rulelist] +#' @description Predicts `rule_nbr` applicable (as per the order in rulelist) +#' for a `row_nbr` (per key) in new_data +#' @param object A [rulelist] +#' @param new_data (dataframe) +#' @param multiple (flag, default: FALSE) Whether to output all rule numbers +#' applicable for a row. If FALSE, the first satisfying rule is provided. +#' @param ... unused +#' @return dataframe. See **Details**. +#' @details If a `row_nbr` is covered more than one `rule_nbr` per 'keys', then +#' `rule_nbr` appearing earlier (as in row order of the [rulelist]) takes +#' precedence. +#' ## Output Format +#' - When multiple is `FALSE`(default), output is a dataframe with three +#' or more columns: `row_number` (int), columns corresponding to 'keys', +#' `rule_nbr` (int). +#' +#' - When multiple is `TRUE`(default), output is a tidytable/dataframe with three +#' or more columns: `row_number` (int), columns corresponding to 'keys', +#' `rule_nbr` (list column of integers). +#' +#' - If a row number and 'keys' combination is not covered by any rule, then `rule_nbr` column has missing value. +#' +#' @examples +#' model_c5 = C50::C5.0(species ~., +#' data = palmerpenguins::penguins, +#' trials = 5, +#' rules = TRUE +#' ) +#' tidy_c5 = tidy(model_c5) +#' tidy_c5 +#' +#' output_1 = predict(tidy_c5, palmerpenguins::penguins) +#' output_1 # different rules per 'keys' (`trial_nbr` here) +#' +#' output_2 = predict(tidy_c5, palmerpenguins::penguins, multiple = TRUE) +#' output_2 # `rule_nbr` is a list-column of integer vectors +#' @seealso [rulelist], [tidy], [augment][augment.rulelist], [predict][predict.rulelist] +#' @importFrom stats predict +#' @family Core Rulelist Utility +#' @export +predict.rulelist = function(object, new_data, multiple = FALSE, ...){ + + checkmate::assert_data_frame(new_data) + checkmate::assert_flag(multiple) + + if (multiple) { + res = predict_all_rulelist(object, new_data) + } else { + res = predict_rulelist(object, new_data) + } + + return(res) +} + +################################################################################ +#### coerce from dataframe +################################################################################ + +#' @name as_rulelist +#' @title as_rulelist generic from [tidyrules][package_tidyrules] package +#' @description as_rulelist generic +#' @param x object to be coerced to a [rulelist] +#' @param ... for methods to use +#' @return A [rulelist] +#' @seealso [rulelist], [tidy], [augment][augment.rulelist], [predict][predict.rulelist] +#' @family Core Rulelist Utility +#' @export +as_rulelist = function(x, ...){ + UseMethod("as_rulelist", x) +} + +#' @name as_rulelist.data.frame +#' @title as_rulelist method for a data.frame +#' @description Convert a set of rules in a dataframe to a [rulelist] +#' @param x dataframe to be coerced to a [rulelist] +#' @param keys (character vector, default: NULL) column names which form the key +#' @param model_type (string, default: NULL) Name of the model which generated +#' the rules +#' @param estimation_type (string) One among: 'regression', +#' 'classification' +#' @param ... currently unused +#' @return [rulelist] object +#' @details Input dataframe should contain these columns: `rule_nbr`, `LHS`, +#' `RHS`. Providing other inputs helps augment better. +#' @examples +#' rules_df = tidytable::tidytable(rule_nbr = 1:2, +#' LHS = c("var_1 > 50", "var_2 < 30"), +#' RHS = c(2, 1) +#' ) +#' as_rulelist(rules_df, estimation_type = "regression") +#' @seealso [rulelist], [tidy], [augment][augment.rulelist], [predict][predict.rulelist] +#' @family Core Rulelist Utility +#' @export +as_rulelist.data.frame = function(x, + keys = NULL, + model_type = NULL, + estimation_type, + ... + ){ + checkmate::assert_character(keys, + min.len = 1, + any.missing = FALSE, + unique = TRUE, + null.ok = TRUE + ) + + # #### checks + # 1. basic cols exist. + # 2. keys are different from basic cols. + # 3. key columns exist. + # 4. key along with 'rule_nbr' form unique rows without missing values. + # 5. rule_nbr (integerish), LHS(character), RHS(any vector) should not have + # missing values. + # 6. 'estimation_type' should be one among: classification, regression + + # check on basic columns and 'key' columns + basic_cols = c("rule_nbr", "LHS", "RHS") + if (is.null(keys)) { + checkmate::assert_subset(basic_cols, colnames(x)) + # create key combo + key_combo_df = distinct(x, rule_nbr) + + } else { + + # keys should be different from basic cols + if (length(intersect(keys, basic_cols)) > 0) { + rlang::abort("keys should not one among: 'rule_nbr', 'LHS', 'RHS'") + } + # expected columns exist exist + checkmate::assert_subset(c(basic_cols, keys), colnames(x)) + # create key combo + key_combo_df = distinct(select(x, all_of(c("rule_nbr", keys)))) + } + + checkmate::assert_true(anyDuplicated(key_combo_df) == 0) + checkmate::assert_false(anyNA(key_combo_df)) + + checkmate::assert_integerish(x$rule_nbr, any.missing = FALSE) + checkmate::assert_character(x$LHS, any.missing = FALSE) + checkmate::assert_vector(x$RHS, any.missing = FALSE) + + checkmate::assert_string(model_type, null.ok = TRUE) + checkmate::assert_string(estimation_type) + checkmate::assert_subset(estimation_type, c("classification", "regression")) + + # set class and attributes + res = rlang::duplicate(x) + + class(res) = c("rulelist", class(res)) + if (!is.null(model_type)) { + attr(res, "model_type") = model_type + } + + attr(res, "estimation_type") = estimation_type + + return(res) +} + +################################################################################ +#### set_keys +################################################################################ + +#' @name set_keys +#' @title Set keys for a [rulelist] +#' @description 'keys' are a set of column(s) whose unique combination +#' identifies a group of rules in a [rulelist]. Methods like +#' [predict.rulelist], [augment.rulelist] produce output per key combination. +#' @param x A [rulelist] +#' @param keys (character vector or NULL) +#' @return A [rulelist] object +#' @details A new [rulelist] is returned with attr `keys` is modified. The input +#' [rulelist] object is unaltered. +#' @examples +#' model_c5 = C50::C5.0(Attrition ~., data = modeldata::attrition, rules = TRUE) +#' tidy_c5 = tidy(model_c5) +#' tidy_c5 # keys are: "trial_nbr" +#' +#' new_tidy_c5 = set_keys(tidy_c5, NULL) # remove all keys +#' new_tidy_c5 +#' new_2_tidy_c5 = set_keys(new_tidy_c5, "trial_nbr") # set "trial_nbr" as key +#' new_2_tidy_c5 +#' +#' # Note that `tidy_c5` and `new_tidy_c5` are not altered. +#' tidy_c5 +#' new_tidy_c5 +#' @seealso [rulelist], [tidy], [augment][augment.rulelist], [predict][predict.rulelist] +#' @family Core Rulelist Utility +#' @export +set_keys = function(x, keys){ + + checkmate::assert_character(keys, null.ok = TRUE) + if (!is.null(keys)){ + checkmate::assert_subset(keys, colnames(x)) + checkmate::assert_false(any(c("LHS", "RHS", "row_nbr") %in% keys)) + } + res = rlang::duplicate(x) + attr(res, "keys") = keys + return(res) +} diff --git a/R/tidy.R b/R/tidy.R new file mode 100644 index 0000000..2fa526e --- /dev/null +++ b/R/tidy.R @@ -0,0 +1,992 @@ +################################################################################ +# This is the part of the 'tidyrules' R package hosted at +# https://github.com/talegari/tidyrules with GPL-3 license. +################################################################################ + +#' @name tidy +#' @title `tidy` is re-export of [generics::tidy] from +#' [tidyrules][package_tidyrules] package +#' @description `tidy` applied on a supported model fit creates a [rulelist]. +#' **See Also** section links to documentation of specific methods. +#' @param x A supported model object +#' @param ... For model specific implementations to use +#' @seealso [tidy], [tidy.C5.0], [tidy.rpart], [tidy.constparty], [tidy.cubist], +#' [rulelist], [augment][augment.rulelist], [predict][predict.rulelist] +#' @importFrom generics tidy +#' @family Core Tidy Utility +#' @export +generics::tidy + +#' @name tidy.C5.0 +#' @title Get the [rulelist] from a [C5][C50::C5.0] model +#' @description Each row corresponds to a rule per `trial_nbr` +#' @param x [C50::C5.0] model fitted with `rules = TRUE` +#' @param ... Other arguments (See details) +#' @return A [rulelist] object +#' @details +#' - The output columns are: `rule_nbr`, `trial_nbr`, `LHS`, `RHS`, +#' `support`, `confidence`, `lift`. +#' - Rules per `trial_nbr` are sorted in this order: `desc(confidence)`, +#' `desc(lift)`, `desc(support)`. +#' +#' Optional named arguments: +#' - `laplace` (flag, default: TRUE) is supported. This +#' computes confidence with laplace correction as documented under 'Rulesets' +#' here: [C5 doc](https://www.rulequest.com/see5-unix.html). +#' +#' @examples +#' model_c5 = C50::C5.0(Attrition ~., data = modeldata::attrition, rules = TRUE) +#' tidy(model_c5) +#' +#' @seealso [tidy], [tidy.C5.0], [tidy.rpart], [tidy.constparty], [tidy.cubist], +#' [rulelist], [augment.rulelist], [predict.rulelist] +#' @family Core Tidy Utility +#' @export + +tidy.C5.0 = function(x, ...){ + + #### checks ################################################################# + + arguments = list(...) + arguments[["laplace"]] = arguments[["laplace"]] %||% TRUE + + # for magrittr dot + . = NULL + + if (!x[["rbm"]]){ + rlang::abort("Model should be built using `rules = TRUE` argument.") + } + + # output of the model + output = x[["output"]] + + # get variable specification + var_spec = varSpec(x) + variable_names = var_spec[["variable"]] + col_classes = var_spec[["type"]] + names(col_classes) = variable_names + + # throw error if there is consecutive spaces + # output from the model squishes the spaces + if (any(stringr::str_count(variable_names, " ") > 0)){ + rlang::abort("Variable names should not two or more consecutive spaces.") + } + + #### core logic ############################################################## + # extract rules part + spl = + output %>% + stringr::str_replace_all("\t", "") %>% # remove tab spaces + stringr::str_replace_all("\n ", "") %>% # handle multiline lineitems + strSplitSingle("\n") # split along newlines + + # detect where the rules start + start_rules_position = min(which(stringr::str_detect(spl, "^Rule "))) + + # detect where the rules end + end_rules_position = + stringr::str_detect(spl, "^Evaluation on training data") %>% + which() %>% + magrittr::subtract(1) %>% + min() + + # get the rules part + spl = spl[start_rules_position:end_rules_position] %>% + stringr::str_squish() %>% + removeEmptyLines() + + ## get raw rules by splitting + + # every rule starts with 'Rule' + cuts = which(stringr::str_detect(spl, "^Rule ")) + # end of rule is a line before the start of next rule + cuts2 = which(stringr::str_detect(spl, "^\\-\\> ")) + + # split rules + rules_raw = purrr::map2(cuts, cuts2, function(x, y) spl[x:y]) + + ## function to get a parsable rule from a raw rule + getRules = function(single_raw_rule){ + + # empty list container + rule = list() + + # get stats from first line ---- + first_line = single_raw_rule[1] + + # A typical first line looks like: + #************************************************** + # "Rule 0/1: (521/30, lift 1.1)", ":" + #************************************************** + + index = strSplitSingle(first_line, ":") %>% + magrittr::extract(1) %>% + strSplitSingle("\\s") %>% + magrittr::extract(2) %>% + strSplitSingle("/") + + if (length(index) == 2){ + rule[["rule_number"]] = as.integer(index[2]) + rule[["trial_number"]] = as.integer(index[1]) + 1L + } else { + rule[["rule_number"]] = as.integer(index) + rule[["trial_number"]] = 1L + } + + stats = + strSplitSingle(first_line, ":") %>% + magrittr::extract(2) %>% + strSplitSingle("\\(") %>% + magrittr::extract(2) %>% + strSplitSingle("\\)") %>% + magrittr::extract(1) %>% + strSplitSingle(",") %>% + stringr::str_squish() + + support_confidence = strSplitSingle(stats[1], "/") + if (length(support_confidence) > 1){ + + # extract support + rule[["support"]] = as.integer(support_confidence[1]) + + # compute confidence (not extract) + if (arguments[["laplace"]]){ + + # C5 doc computes confidence using laplace correction + # (n-m+1)/(n+2) + # n: number of obs in leaf + # m: number of musclassifications among n + rule[["confidence"]] = + rule[["support"]] %>% + magrittr::subtract(as.integer(support_confidence[2])) %>% + magrittr::add(1) %>% + magrittr::divide_by(rule[["support"]] + 2) %>% + round(4) + + } else { + + # without laplace correction + # simply: (n-m)/n + rule[["confidence"]] = + rule[["support"]] %>% + magrittr::subtract(as.integer(support_confidence[2])) %>% + magrittr::divide_by(rule[["support"]]) %>% + round(4) + } + + } else { + + rule[["support"]] = as.integer(support_confidence) + # see comments for laplace above + if (arguments[["laplace"]]){ + rule[["confidence"]] = (rule[["support"]] + 1)/(rule[["support"]] + 2) + } else{ + rule[["confidence"]] = 1 + } + } + + rule[["lift"]] = + strSplitSingle(stats[2], "\\s") %>% + magrittr::extract(2) %>% + as.numeric() + + # curate a single line item of the rule ---- + line_item_curator = function(line_item){ + + # in unforeseen cases just return the rule string + # let the parsing test catch it + line_item_rule = line_item + + # 'in' separator for a single line item of rule + # ex1: JobInvolvement in [Low-Medium] for ordered factors + # ex2: JobRole in {Laboratory_Technician, Sales_Representative} + if (stringr::str_detect(line_item, "\\sin\\s")){ + split_line_item = strSplitSingle(line_item, "\\sin\\s") + lhs_line_item = split_line_item[1] + rhs_line_item = split_line_item[2] + + # unordered factor case + if (stringr::str_detect(line_item, "\\{")){ + rhs_line_item = + rhs_line_item %>% + strHead(-1) %>% # remove quotes + strTail(-1) %>% + strSplitSingle(",") %>% # split the list by comma + stringr::str_trim() %>% # trim if any + # add quotes around levels + purrr::map_chr(function(x) stringr::str_c("'", x, "'")) %>% + stringr::str_c(collapse = ", ") %>% # bind with comma + stringr::str_c("c(", ., ")") # create 'c' structure + + line_item_rule = stringr::str_c(lhs_line_item, + " %in% ", + rhs_line_item + ) + } + + # unordered factor case + if (stringr::str_detect(line_item, "\\[")){ + rhs_line_item = + rhs_line_item %>% + strHead(-1) %>% + strTail(-1) + + # more than one hyphen means some factor level has hyphen + if (stringr::str_count(rhs_line_item, "-") > 1){ + rlang::abort("factor levels cannot have '-'.") + } + + rhs_line_item = rhs_line_item %>% + strSplitSingle("-") %>% + stringr::str_squish() # in case there is space + + # get the levels of the variable + levels = + var_spec[var_spec[["variable"]] == lhs_line_item, ] %>% + as.list() %>% + magrittr::extract2("levels") %>% + magrittr::extract2(1) + + # get all levels between start and end level + start_level = which(levels == rhs_line_item[1]) + end_level = which(levels == rhs_line_item[2]) + + # construct RHS of the line item + rhs_line_item = + levels[start_level:end_level] %>% + stringr::str_c("'", ., "'") %>% + stringr::str_c(collapse = ", ") %>% + stringr::str_c("c(", ., ")") + + # complete line rule + line_item_rule = stringr::str_c(lhs_line_item, + " %in% ", + rhs_line_item + ) + } + } + + # handle '=' case + # ex: MaritalStatus = Single + contains_equals = stringr::str_detect(line_item, " = ") + if (contains_equals){ + + sub_rule = + strSplitSingle(line_item, "=") %>% + stringr::str_trim() + + the_class = col_classes[[ sub_rule[1] ]] + + # quote if non-numeric + if (!(the_class %in% c("numeric", "integer"))){ + sub_rule[2] = stringr::str_c("'", sub_rule[2], "'") + } + + line_item_rule = stringr::str_c(sub_rule, collapse = " == ") + + } + + line_item_rule = paste0("( ", line_item_rule, " )") + return(line_item_rule) + } + + # create LHS and RHS ---- + rule[["LHS"]] = + single_raw_rule %>% + utils::tail(-1) %>% # remove first stats line + utils::head(-1) %>% # remove the RHS line + purrr::map(line_item_curator) %>% # get clean rule lines + stringr::str_c(collapse = " & ") # concat them with '&' + + rule[["RHS"]] = + single_raw_rule %>% + utils::tail(1) %>% # get the RHS line + stringr::str_squish() %>% # remove multispaces + strSplitSingle("\\s") %>% # split by space + magrittr::extract(3) # extract the RHS name + + # return cleaned rule ---- + return(rule) + } + + # apply rule tidying for each rule and return tibble + res = + purrr::map(rules_raw, getRules) %>% + purrr::transpose() %>% + purrr::simplify_all() %>% + tidytable::as_tidytable() + + #### finalize output ######################################################### + # replace variable names with spaces within backquotes + for (i in 1:length(variable_names)){ + res[["LHS"]] = + stringr::str_replace_all(res[["LHS"]], + variable_names[i], + addBackquotes(variable_names[i]) + ) + } + + #### return ################################################################## + res = + res %>% + select(trial_nbr = trial_number, + LHS, RHS, + support, confidence, lift + ) %>% + arrange(trial_nbr, desc(confidence), desc(lift), desc(support)) %>% + mutate(rule_nbr = 1:n(), .by = trial_nbr) %>% + mutate(RHS = factor(RHS)) %>% + relocate(rule_nbr, trial_nbr) + + class(res) = c("rulelist", class(res)) + + attr(res, "keys") = "trial_nbr" + attr(res, "model_type") = "C5" + attr(res, "estimation_type") = "classification" + + return(res) +} + +#' @name tidy.rpart +#' @title Get the [rulelist] from a [rpart][rpart::rpart] model +#' @description Each row corresponds to a rule +#' @param x [rpart::rpart] model +#' @param ... Other arguments (currently unused) +#' @return A [rulelist] object +#' @details For rpart rules, one should build the model without [ordered +#' factor][base::ordered] variable. We recommend you to convert [ordered +#' factor][base::ordered] to [factor][base::factor] or *integer* class. +#' +#' For [rpart::rpart] classification model: +#' - Output columns are: `rule_nbr`, `LHS`, `RHS`, `support`, `confidence`, `lift`. +#' - The rules are sorted in this order: `desc(confidence)`, `desc(lift)`, +#' `desc(support)`. +#' +#' For [rpart::rpart] regression(anova) model: +#' - Output columns are: `rule_nbr`, `LHS`, `RHS`, `support`. +#' - The rules are sorted in this order: `desc(support)`. +#' @examples +#' model_class_rpart = rpart::rpart(Species ~ ., data = iris) +#' tidy(model_class_rpart) +#' +#' model_regr_rpart = rpart::rpart(Sepal.Length ~ ., data = iris) +#' tidy(model_regr_rpart) +#' @seealso [tidy], [tidy.C5.0], [tidy.rpart], [tidy.constparty], [tidy.cubist], +#' [rulelist], [augment.rulelist], [predict.rulelist] +#' @family Core Tidy Utility +#' @export + +tidy.rpart = function(x, ...){ + + ##### assertions and prep #################################################### + arguments = list(...) + + # supported 'rpart' classes + method_rpart = x$method + # classification: class, regression: anova + checkmate::assert_choice(method_rpart, c("class", "anova")) + + # build with y = TRUE + if (is.null(x$y)){ + rlang::abort("rpart model should be built using argument `y = TRUE`.") + } + + # column names from the x: This will be used at the end to handle the + # variables with a space + col_names = + attr(x$terms, which = "term.labels") %>% + stringr::str_remove_all(pattern = "`") + + # throw error if there are consecutive spaces in the column names + if (any(stringr::str_count(col_names, " ") > 0)){ + rlang::abort( + "Variable names should not have two or more consecutive spaces.") + } + + #### core extraction work #################################################### + + # convert to class "party" + party_obj = partykit::as.party(x) + + # extract rules + rules = + list.rules.party(party_obj) %>% + stringr::str_replace_all(pattern = "\\\"","'") %>% + stringr::str_remove_all(pattern = ", 'NA'") %>% + stringr::str_remove_all(pattern = "'NA',") %>% + stringr::str_remove_all(pattern = "'NA'") %>% + stringr::str_squish() %>% + stringr::str_split(" & ") %>% + purrr::map(~ stringr::str_c("( ", .x, " )")) %>% + purrr::map_chr(~ stringr::str_c(.x, collapse = " & ")) + + terminal_nodes = partykit::nodeids(party_obj, terminal = TRUE) + + # create metrics df + if (method_rpart == "class"){ + prevalence = as.numeric(prop.table(table(x$y))) + + res = + x$frame[terminal_nodes, c("n", "dev", "yval")] %>% + tidytable::mutate(confidence = (n + 1 - dev) / (n + 2)) %>% + tidytable::rename(support = n, predict_class = yval) %>% + tidytable::mutate(RHS = attr(x, "ylevels")[predict_class]) %>% + tidytable::mutate(prevalence = prevalence[predict_class]) %>% + tidytable::mutate(lift = confidence / prevalence) %>% + tidytable::mutate(LHS = rules) + + } else if (method_rpart == "anova"){ + res = + x$frame[terminal_nodes, c("n","yval")] %>% + tidytable::rename(support = n, RHS = yval) %>% + tidytable::mutate(LHS = rules) + } + + #### finalize output ######################################################### + + # replace variable names with spaces within backquotes + for (i in 1:length(col_names)) { + res[["LHS"]] = + stringr::str_replace_all(res[["LHS"]], + col_names[i], + addBackquotes(col_names[i]) + ) + } + + #### return ################################################################## + + res[["rule_nbr"]] = 1:nrow(res) + + if (method_rpart == "class"){ + res = + res %>% + select(LHS, RHS, support, confidence, lift) %>% + arrange(desc(confidence), desc(lift), desc(support)) %>% + mutate(rule_nbr = 1:n()) %>% + mutate(RHS = factor(RHS)) %>% + relocate(rule_nbr) + + } else if (method_rpart == "anova") { + res = + res %>% + select(rule_nbr, LHS, RHS, support) %>% + arrange(desc(support)) %>% + mutate(rule_nbr = 1:n()) %>% + relocate(rule_nbr) + } + + class(res) = c("rulelist", class(res)) + + attr(res, "keys") = NULL + attr(res, "model_type") = "rpart" + if (method_rpart == "class"){ + attr(res, "estimation_type") = "classification" + } else if (method_rpart == "anova"){ + attr(res, "estimation_type") = "regression" + } + + return(res) +} + +#' @name tidy.constparty +#' @title Get the [rulelist] from a [party][partykit::party] model +#' @description Each row corresponds to a rule +#' @param x [partykit::party] model typically built using [partykit::ctree] +#' @param ... Other arguments (currently unused) +#' @return A [rulelist] object +#' @details These types of [party][partykit::party] models are supported: +#' `regression` (y is numeric), `classification` (y is factor) +#' +#' For [party][partykit::party] classification model: +#' +#' - Output columns are: `rule_nbr`, `LHS`, `RHS`, `support`, `confidence`, `lift`, `terminal_node_id`. +#' - Rules are sorted in this order: `desc(confidence)`, `desc(lift)`, +#' `desc(support)`. +#' +#' For [party][partykit::party] regression model: +#' +#' - Output columns are: `rule_nbr`, `LHS`, `RHS`, `support`, `IQR`, `RMSE`, `terminal_node_id`. +#' - Rules are sorted in this order: `RMSE`, `desc(support)`. +#' @examples +#' pen = palmerpenguins::penguins +#' model_class_party = partykit::ctree(species ~ ., data = pen) +#' tidy(model_class_party) + +#' model_regr_party = partykit::ctree(bill_length_mm ~ ., data = pen) +#' tidy(model_regr_party) +#' @seealso [tidy], [tidy.C5.0], [tidy.rpart], [tidy.constparty], [tidy.cubist], +#' [rulelist], [augment.rulelist], [predict.rulelist] +#' @family Core Tidy Utility +#' @export + +tidy.constparty = function(x, ...){ + + ##### assertions and prep #################################################### + arguments = list(...) + + # column names from the x: This will be used at the end to handle the + # variables with a space + col_names = + attr(x$terms, which = "term.labels") %>% + stringr::str_remove_all(pattern = "`") + + # throw error if there are consecutive spaces in the column names + if (any(stringr::str_count(col_names, " ") > 0)){ + rlang::abort( + "Variable names should not have two or more consecutive spaces.") + } + + # detect method using 'fitted' + fitted_df = tidytable::as_tidytable(x$fitted) + colnames(fitted_df) = c("terminal_node_id", "weight", "response") + fitted_df[["terminal_node_id"]] = as.character(fitted_df[["terminal_node_id"]]) + + y_class = class(fitted_df[["response"]]) + if (y_class == "factor") { + type = "classification" + } else if (y_class %in% c("numeric", "integer")) { + type = "regression" + } else { + rlang::inform("tidy supports only classification and regression 'party' models") + rlang::abort("Unsupported party object") + } + + #### core extraction work #################################################### + + # extract rules + raw_rules = list.rules.party(x) + + rules_df = + raw_rules %>% + stringr::str_replace_all(pattern = "\\\"","'") %>% + stringr::str_remove_all(pattern = ", 'NA'") %>% + stringr::str_remove_all(pattern = "'NA',") %>% + stringr::str_remove_all(pattern = "'NA'") %>% + stringr::str_squish() %>% + stringr::str_split(" & ") %>% + purrr::map(~ stringr::str_c("( ", .x, " )")) %>% + purrr::map_chr(~ stringr::str_c(.x, collapse = " & ")) %>% + tidytable::tidytable(LHS = .) %>% + tidytable::mutate(terminal_node_id = names(raw_rules)) + + # create metrics df + if (type == "classification"){ + + terminal_response_df = + fitted_df %>% + tidytable::summarise(sum_weight = sum(weight, na.rm = TRUE), + .by = c(terminal_node_id, response) + ) %>% + tidytable::slice_max(n = 1, + order_by = sum_weight, + by = terminal_node_id, + with_ties = FALSE + ) %>% + tidytable::select(terminal_node_id, + winning_response = response + ) + + prevalence_df = + fitted_df %>% + tidytable::summarise(prevalence = sum(weight, na.rm = TRUE), + .by = response + ) %>% + tidytable::mutate(prevalence = prevalence / sum(prevalence)) %>% + tidytable::select(response, prevalence) + + res = + fitted_df %>% + # bring 'winning_response' column + tidytable::left_join(terminal_response_df, + by = "terminal_node_id" + ) %>% + # bring 'prevalence' column + tidytable::left_join(prevalence_df, + by = c("winning_response" = "response") + ) %>% + tidytable::summarise( + support = sum(weight), + confidence = weighted.mean(response == winning_response, weight, na.rm = TRUE), + lift = weighted.mean(response == winning_response, weight, na.rm = TRUE) / prevalence[1], + RHS = winning_response[1], + .by = terminal_node_id + ) %>% + tidytable::left_join(rules_df, by = "terminal_node_id") %>% + tidytable::arrange(desc(confidence), desc(lift), desc(support)) %>% + tidytable::mutate(., rule_nbr = 1:nrow(.)) %>% + mutate(RHS = factor(RHS)) %>% + tidytable::select(rule_nbr, LHS, RHS, + support, confidence, lift, + terminal_node_id + ) + + } else if (type == "regression"){ + + res = + fitted_df %>% + tidytable::mutate(average = weighted.mean(response, weight, na.rm = TRUE), + .by = terminal_node_id + ) %>% + tidytable::summarise( + support = sum(weight), + IQR = DescTools::IQRw(response, weight, na.rm = TRUE), + RMSE = MetricsWeighted::rmse(actual = response, + predicted = average, + w = weight, + na.rm = TRUE + ), + average = mean(average), + .by = terminal_node_id + ) %>% + tidytable::left_join(rules_df, by = "terminal_node_id") %>% + tidytable::arrange(RMSE, desc(support)) %>% + tidytable::mutate(., rule_nbr = 1:nrow(.)) %>% + tidytable::select(rule_nbr, LHS, RHS = average, + support, IQR, RMSE, + terminal_node_id + ) + } + + #### finalize output ######################################################### + + # replace variable names with spaces within backquotes + for (i in 1:length(col_names)) { + res[["LHS"]] = + stringr::str_replace_all(res[["LHS"]], + col_names[i], + addBackquotes(col_names[i]) + ) + } + + #### return ################################################################## + + class(res) = c("rulelist", class(res)) + + attr(res, "keys") = NULL + attr(res, "model_type") = "constparty" + attr(res, "estimation_type") = type + + return(res) +} + +#' @name tidy.cubist +#' @title Get the [rulelist] from a [cubist][Cubist::cubist] model +#' @description Each row corresponds to a rule per `committee` +#' @param x [Cubist::cubist] model +#' @param ... Other arguments (currently unused) +#' @return A [rulelist] object +#' @details +#' - The output columns are: `rule_nbr`, `committee`, `LHS`, `RHS`, `support`, `mean`, `min`, `max`, `error`. +#' +#' - Rules are sorted in this order per committee: +#' `error`, `desc(support)` +#' @examples +#' att = modeldata::attrition +#' cols_att = setdiff(colnames(att), c("MonthlyIncome", "Attrition")) +#' model_cubist = Cubist::cubist(x = att[, cols_att], +#' y = att[["MonthlyIncome"]] +#' ) +#' tidy(model_cubist) +#' @seealso [tidy], [tidy.C5.0], [tidy.rpart], [tidy.constparty], [tidy.cubist], +#' [rulelist], [augment.rulelist], [predict.rulelist] +#' @family Core Tidy Utility +#' @export + +tidy.cubist = function(x, ...){ + + #### core rule extraction #################################################### + # output from the model + output = x[["output"]] + + # get variable specification + var_spec = varSpec(x) + variable_names = var_spec[["variable"]] + col_classes = var_spec[["type"]] + names(col_classes) = variable_names + + # throw error if there is consecutive spaces + # output from the model squishes the spaces + if(any(stringr::str_count(variable_names, " ") > 0)){ + rlang::abort("Variable names should not two or more consecutive spaces.") + } + + variable_names_with_ = + stringr::str_replace_all(variable_names, "\\s", "_") + + # split by newline and remove emptylines + lev_1 = + x[["output"]] %>% + strSplitSingle("\\n") %>% + removeEmptyLines() + + # remove everything from 'Evaluation on training data' onwards + evalLine = stringr::str_which(lev_1, "^Evaluation on training data") + lev_2 = + lev_1[-(evalLine:length(lev_1))] %>% + stringr::str_subset("^(?!Model).*$") + + # detect starts and ends of rules + rule_starts = stringr::str_which(stringr::str_trim(lev_2), "^Rule\\s") + # end of a rule is a line before the next rule start + rule_ends = c(utils::tail(rule_starts, -1) - 1, length(lev_2)) + + # create a rule list for cubist + get_rules_cubist = function(single_raw_rule){ + + # a raw rule looks like this: + # + # [1] " Rule 7: [87 cases, mean 15824.0, range 12061 to 17924, est err 694.4]" + # [2] " if" + # [3] "\tJobLevel <= 4" + # [4] "\tJobRole in {Manager, Research_Director}" + # [5] "\tTotalWorkingYears > 14" + # [6] " then" + # [7] "\toutcome = 4166.4 + 3467 JobLevel - 23 Age - 0.011 MonthlyRate" + + # example with equal sign inside + # Rule 1/14: [35 cases, mean 5.364152, range 4.963788 to 5.521399, est err 0.039525] + # + # if + # Year_Built > 1952 + # Bsmt_Exposure in {Av, Mn, No, No_Basement} + # Gr_Liv_Area <= 1692 + # Kitchen_Qual = Excellent + # then + # outcome = 2.533961 + 0.000252 Gr_Liv_Area + 0.0025 Year_Built + # + 0.002 Year_Remod_Add + 0.000105 Garage_Area + # - 0.00054 Lot_Frontage - 3.8e-05 Bsmt_Unf_SF + # + 2.4e-05 Total_Bsmt_SF + 7e-07 Lot_Area - 0.005 Bedroom_AbvGr + # + 0.003 Garage_Cars + 0.003 Fireplaces + 0.07 Longitude + # + 0.001 TotRms_AbvGrd + + res = list() + + # locate the position of square bracket and collect stats + firstLine = stringr::str_squish(single_raw_rule[1]) + openingSquareBracketPosition = stringr::str_locate(firstLine, "\\[")[1, 1] + + # All stats are at the begining of the rule + stat = + # between square brackets + stringr::str_sub(firstLine + , openingSquareBracketPosition + 1 + , stringr::str_length(firstLine) - 1 # closing ] bracket + ) %>% + strSplitSingle("\\,") %>% + stringr::str_trim() + + res[["support"]] = stat[1] %>% + strSplitSingle("\\s") %>% + magrittr::extract(1) %>% + as.integer() + + res[["mean"]] = stat[2] %>% + strSplitSingle(" ") %>% + magrittr::extract(2) %>% + as.numeric() + + res[["min"]] = stat[3] %>% + strSplitSingle(" ") %>% + magrittr::extract(2) %>% + as.numeric() + + res[["max"]] = stat[3] %>% + strSplitSingle(" ") %>% + magrittr::extract(4) %>% + as.numeric() + + res[["error"]] = stat[4] %>% + strSplitSingle(" ") %>% + magrittr::extract(3) %>% + as.numeric() + + # is if-then missing (only outcome is there) + if_exists = any(stringr::str_trim(single_raw_rule) == "if") + + if (if_exists){ + # get LHS + btw_if_then = + seq(which(stringr::str_trim(single_raw_rule) == "if") + 1, + which(stringr::str_trim(single_raw_rule) == "then") - 1 + ) + + # unclean LHS strings, one condition per string + lhsStrings = + single_raw_rule[btw_if_then] %>% + stringr::str_replace_all("\\t", "\\\\n") %>% + stringr::str_trim() %>% + stringr::str_c(collapse = " ") %>% + strSplitSingle("\\\\n") %>% + removeEmptyLines() %>% + stringr::str_trim() + + # function to get the one clean rule string + getRuleString = function(string){ + + # to avoid CRAN notes + . = NULL + + # if there is ' in {' in the string + if(stringr::str_detect(string, "\\sin\\s\\{")){ + + # split with ' in {' + var_lvls = strSplitSingle(string, "\\sin\\s\\{") + + # get the contents inside curly braces + lvls = + var_lvls[2] %>% + # omit the closing curly bracket + strHead(-1) %>% + strSplitSingle(",") %>% + stringr::str_trim() %>% + purrr::map_chr(function(x) stringr::str_c("'", x, "'")) %>% + stringr::str_c(collapse = ", ") %>% # note the space next to comma + stringr::str_c("c(", ., ")") + + # get the variable + var = stringr::str_trim(var_lvls[1]) + rs = stringr::str_c(var, " %in% ", lvls) + + } else { + + # handle '=' case + contains_equals = stringr::str_detect(string, " = ") + + if (contains_equals){ + + sub_rule = strSplitSingle(string, "=") %>% + stringr::str_trim() + + if(!(col_classes[sub_rule[1]] == "numeric")){ + sub_rule[2] = stringr::str_c("'", sub_rule[2], "'") + } + + rs = stringr::str_c(sub_rule, collapse = " == ") + } else { + # nothing to do + rs = string + } + + } # end of handle '=' case + + return(rs) + + } + + # clean up LHS as string + res[["LHS"]] = + purrr::map_chr(lhsStrings, getRuleString) %>% + stringr::str_c("( ", ., " )") %>% + stringr::str_c(collapse = " & ") # note spaces next to AND + } else { + + res[["LHS"]] = NA + + } + + # get RHS + # then might not exist: still retaining old name 'afterThen' + if (if_exists){ + afterThen = seq(which(stringr::str_trim(single_raw_rule) == "then") + 1, + length(single_raw_rule) + ) + } else { + afterThen = seq( + which(stringr::str_detect(stringr::str_trim(single_raw_rule), + "^outcome" + ) + ), + length(single_raw_rule) + ) + } + + # handle brackets around signs + res[["RHS"]] = + single_raw_rule[afterThen] %>% + stringr::str_replace_all("\\t", "") %>% + stringr::str_trim() %>% + stringr::str_c(collapse = " ") %>% + stringr::str_squish() %>% + stringr::str_replace("outcome = ", "") %>% + # remove spaces around +- signs + stringr::str_replace_all("\\s\\+\\s", "++") %>% + stringr::str_replace_all("\\s\\-\\s", "--") %>% + strReplaceReduce(variable_names, variable_names_with_) %>% + stringr::str_replace_all("\\s", " * ") %>% + stringr::str_replace_all("\\+\\+", ") + (") %>% + stringr::str_replace_all("\\-\\-", ") - (") + + # quotes aroud each addenum + res[["RHS"]] = + stringr::str_c("(", res[["RHS"]], ")") %>% + # honour negative intercept + stringr::str_replace("\\(\\)\\s\\-\\s\\(", "(-") + + return(res) + } + + # see if rules have commitees and create commitees vector + rule_number_splits = + stringr::str_split(stringr::str_trim(lev_2)[rule_starts], ":") %>% + purrr::map_chr(function(x) x[[1]]) %>% + stringr::str_split("\\s") %>% + purrr::map_chr(function(x) x[[2]]) %>% + stringr::str_split("/") %>% + simplify2array() %>% + as.integer() + + if (length(rule_number_splits) > length(rule_starts)){ + committees = + rule_number_splits[seq(1, by = 2, length.out = length(rule_starts))] + } else { + committees = rep(1L, length(rule_starts)) + } + + # create parsable rules from raw rules + res = + purrr::map(1:length(rule_starts), + function(i) lev_2[rule_starts[i]:rule_ends[i]] + ) %>% + purrr::map(get_rules_cubist) %>% + purrr::transpose() %>% + purrr::map(unlist) %>% + tidytable::as_tidytable() + + #### prepare and return ###################################################### + # replace variable names with spaces within backquotes + for (i in 1:length(variable_names)){ + res[["LHS"]] = + stringr::str_replace_all(res[["LHS"]], + variable_names[i], + addBackquotes(variable_names[i]) + ) + + res[["RHS"]] = stringr::str_replace_all(res[["RHS"]], + variable_names_with_[i], + addBackquotes(variable_names[i]) + ) + } + + res = + res %>% + tidytable::mutate(committee = local(committees)) %>% + tidytable::arrange(desc(error), .by = committee) %>% + tidytable::mutate(rule_nbr = tidytable::row_number(), .by = committee) + + res = + res %>% + select(committee, LHS, RHS, + support, mean, min, max, error + ) %>% + arrange(committee, error, desc(support)) %>% + mutate(rule_nbr = 1:n(), .by = committee) %>% + relocate(rule_nbr, committee) + + class(res) = c("rulelist", class(res)) + + attr(res, "keys") = "committee" + attr(res, "model_type") = "cubist" + attr(res, "estimation_type") = "regression" + + return(res) +} diff --git a/R/utils.R b/R/utils.R index 9901f5f..693faa5 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,9 +1,9 @@ -# utils ################################################################################ # This is the part of the 'tidyrules' R package hosted at # https://github.com/talegari/tidyrules with GPL-3 license. ################################################################################ +#' @keywords internal #' @name positionSpaceOutsideSinglequotes #' @title Position of space outside single quotes #' @description (vectorised) Detect the position of space in a string not within @@ -30,6 +30,7 @@ positionSpaceOutsideSinglequotes = Vectorize( USE.NAMES = FALSE ) +#' @keywords internal #' @name removeEmptyLines #' @title Remove empty lines #' @description Remove empty strings from a character vector @@ -44,6 +45,7 @@ removeEmptyLines = function(strings){ strings[!(strings == "")] } +#' @keywords internal #' @name strSplitSingle #' @title String split a string #' @description and return a character vector (not a list) @@ -62,12 +64,13 @@ strSplitSingle = function(string, pattern){ } +#' @keywords internal #' @name strHead #' @title Vectorized semantic equivalent of 'head' for a string #' @description Picks the substring starting from the first character #' @param string string #' @param n (integer) Number of characters -#' @details 'n' can be in the interval [-len + 1, len] (both ends inclusive) +#' @details 'n' can be in the interval \[-len + 1, len\] (both ends inclusive) #' @return A string #' @examples #' \donttest{ @@ -93,14 +96,13 @@ strHead = Vectorize( USE.NAMES = FALSE ) - - +#' @keywords internal #' @name strTail #' @title Vectorized semantic equivalent of tail for a string #' @description Picks the substring starting from the first character #' @param string string #' @param n (integer) Number of characters -#' @details 'n' can be in the interval [-len + 1, len] (both ends inclusive) +#' @details 'n' can be in the interval \[-len + 1, len\] (both ends inclusive) #' @return A string #' @examples #' \donttest{ @@ -125,6 +127,8 @@ strTail = Vectorize( vectorize.args = "string", USE.NAMES = FALSE ) + +#' @keywords internal #' @name addBackquotes #' @title Add backquotes #' @description (vectorized) Add backquotes when a string has a space in it @@ -150,6 +154,7 @@ addBackquotes = Vectorize( USE.NAMES = FALSE ) +#' @keywords internal #' @name strReplaceReduce #' @title Sequential string replace #' @description Sequential string replace via reduce @@ -175,3 +180,197 @@ strReplaceReduce = function(string, pattern, replacement){ .init = string ) } + +#' @keywords internal +#' @name varSpec +#' @title Get variable specification for a Cubist/C5 object +#' @description Obtain variable names, type (numeric, ordered, factor) and +#' levels as a tidytable +#' @param object Cubist/C5 object +#' @return A tidytable with three columns: variable(character), type(character) +#' and levels(a list-column). For numeric variables, levels are set to NA. +#' @examples +#' \dontrun{ +#' data("attrition", package = "modeldata") +#' cols_att = setdiff(colnames(attrition), c("MonthlyIncome", "Attrition")) +#' +#' cb_att = Cubist::cubist(x = attrition[, cols_att], +#' y = attrition[["MonthlyIncome"]] +#' ) +#' varSpec(cb_att) +#' } +varSpec = function(object){ + + # 1. split ny newline + # 2. remove a few header lines + # 3. get variables and details + + lines_raw = + object[["names"]] %>% + strSplitSingle("\\n") + + outcome_line_number = stringr::str_which(lines_raw, "^outcome:") + + lines = + lines_raw[-(1:outcome_line_number)] %>% + removeEmptyLines() + + split_lines = + lines %>% + stringr::str_split(":") %>% + purrr::transpose() + + variables = + split_lines %>% + magrittr::extract2(1) %>% + unlist() %>% + stringr::str_replace_all("\\\\", "") # clean up variable names + + details = + split_lines %>% + magrittr::extract2(2) %>% + unlist() %>% + stringr::str_trim() + + # handle a detail depending on its type + handleDetail = function(adetail){ + + if (adetail == "continuous."){ + # handle numeric/integer + out = list(type = "numeric", levels = NA_character_) + + } else if (stringr::str_detect(adetail, "^\\[ordered\\]")){ + # handle ordered factors + + levels = + adetail %>% + strSplitSingle("\\[ordered\\]") %>% + magrittr::extract(2) %>% + strHead(-1) %>% + strSplitSingle(",") %>% + stringr::str_trim() + + out = list(type = "ordered", levels = levels) + + } else { # handle unordered factors + + levels = + adetail %>% + strHead(-1) %>% + strSplitSingle(",") %>% + stringr::str_trim() + + out = list(type = "factor", levels = levels) + } + + return(out) + } + + details_cleaned = + details %>% + purrr::map(handleDetail) %>% + purrr::transpose() + + details_cleaned[["type"]] = unlist(details_cleaned[["type"]]) + details_cleaned[["variable"]] = variables + + res = tidytable::as_tidytable(details_cleaned) + return(res) +} + +#' @name convert_rule_flavor +#' @title Convert a R parsable rule to python/sql parsable rule +#' @description Convert a R parsable rule to python/sql parsable rule +#' @param rule (chr vector) R parsable rule(s) +#' @param flavor (string) One among: 'python', 'sql' +#' @return (chr vector) of rules +#' @seealso [rulelist], [tidy], [augment][augment.rulelist], [predict][predict.rulelist], [to_sql_case] +#' @family Auxiliary Rulelist Utility +#' @export +convert_rule_flavor = function(rule, flavor){ + + checkmate::assert_character(rule) + checkmate::assert_string(flavor) + flavor = stringr::str_to_lower(flavor) + checkmate::assert_choice(flavor, c("python", "sql")) + + if (flavor == "python"){ + res = + rule %>% + stringr::str_replace_all("\\( ", "") %>% + stringr::str_replace_all(" \\)", "") %>% + + stringr::str_replace_all("%in%", "in") %>% + stringr::str_replace_all("c\\(", "[") %>% + stringr::str_replace_all("\\)", "]") %>% + + stringr::str_replace_all("&", " ) and (") %>% + + stringr::str_c("( ", ., " )") %>% + stringr::str_squish() + + } else if (flavor == "sql"){ + res = + rule %>% + stringr::str_replace_all("\\( ", "") %>% + stringr::str_replace_all(" \\)", "") %>% + + stringr::str_replace_all("%in%", "IN") %>% + stringr::str_replace_all("c\\(", "[") %>% + stringr::str_replace_all("\\)", "]") %>% + + stringr::str_replace_all("&", " ) AND (") %>% + + stringr::str_c("( ", ., " )") %>% + stringr::str_squish() + } + + attr(res, "flavor") = flavor + return(res) +} + +#' @name to_sql_case +#' @title Extract SQL case statement from a [rulelist] +#' @description Extract SQL case statement from a [rulelist] +#' @param x A [rulelist] object +#' @param rhs_column_name (string, default: "RHS") Name of the column in the +#' rulelist to be used as RHS (WHEN THEN {rhs}) in the sql case +#' statement +#' @param output_colname (string, default: "output") Name of the output column +#' created by the SQL statement (used in case ... AS {output_column}) +#' @return (string invisibly) SQL case statement +#' @details As a side-effect, the SQL statement is cat to stdout. The output +#' contains newline character. +#' @examples +#' model_c5 = C50::C5.0(Attrition ~., data = modeldata::attrition, rules = TRUE) +#' tidy(model_c5) +#' to_sql_case(tidy(model_c5)) +#' @seealso [rulelist], [tidy], [augment][augment.rulelist], [predict][predict.rulelist], [convert_rule_flavor] +#' @family Auxiliary Rulelist Utility +#' @export +to_sql_case = function(x, + rhs_column_name = "RHS", + output_colname = "output" + ){ + + checkmate::assert_class(x, "rulelist") + rhs_is_string = inherits(x[[rhs_column_name]], c("character", "factor")) + lhs_sql = convert_rule_flavor(x$LHS, flavor = "sql") + out = "CASE" + + for (rn in seq_len(nrow(x))) { + + if (rhs_is_string) { + lhs = glue::glue("WHEN {lhs_sql[rn]} THEN '{x[[rhs_column_name]][rn]}'") + } else { + lhs = glue::glue("WHEN {lhs_sql[rn]} THEN {x[[rhs_column_name]][rn]}") + } + out = paste(out, lhs, sep = "\n") + } + out = paste(out, "ELSE NULL", sep = "\n") + out = paste(out, glue::glue("END AS {output_colname}"), sep = "\n") + + cli::cli_code(out, language = "SQL") + + return(invisible(out)) +} diff --git a/R/varSpec.R b/R/varSpec.R deleted file mode 100644 index 1116562..0000000 --- a/R/varSpec.R +++ /dev/null @@ -1,100 +0,0 @@ -################################################################################ -# This is the part of the 'tidyrules' R package hosted at -# https://github.com/talegari/tidyrules with GPL-3 license. -################################################################################ - -#' @name varSpec -#' @title Get variable specification for a Cubist/C5 object -#' @description Obtain variable names, type (numeric, ordered, factor) and -#' levels as a tidytable -#' @param object Cubist/C5 object -#' @return A tidytable with three columns: variable(character), type(character) -#' and levels(a list-column). For numeric variables, levels are set to NA. -#' @examples -#' \dontrun{ -#' data("attrition", package = "modeldata") -#' cols_att = setdiff(colnames(attrition), c("MonthlyIncome", "Attrition")) -#' -#' cb_att = Cubist::cubist(x = attrition[, cols_att], -#' y = attrition[["MonthlyIncome"]] -#' ) -#' varSpec(cb_att) -#' } -varSpec = function(object){ - - # 1. split ny newline - # 2. remove a few header lines - # 3. get variables and details - - lines_raw = - object[["names"]] %>% - strSplitSingle("\\n") - - outcome_line_number = stringr::str_which(lines_raw, "^outcome:") - - lines = - lines_raw[-(1:outcome_line_number)] %>% - removeEmptyLines() - - split_lines = - lines %>% - stringr::str_split(":") %>% - purrr::transpose() - - variables = - split_lines %>% - magrittr::extract2(1) %>% - unlist() %>% - stringr::str_replace_all("\\\\", "") # clean up variable names - - details = - split_lines %>% - magrittr::extract2(2) %>% - unlist() %>% - stringr::str_trim() - - # handle a detail depending on its type - handleDetail = function(adetail){ - - if (adetail == "continuous."){ - # handle numeric/integer - out = list(type = "numeric", levels = NA_character_) - - } else if (stringr::str_detect(adetail, "^\\[ordered\\]")){ - # handle ordered factors - - levels = - adetail %>% - strSplitSingle("\\[ordered\\]") %>% - magrittr::extract(2) %>% - strHead(-1) %>% - strSplitSingle(",") %>% - stringr::str_trim() - - out = list(type = "ordered", levels = levels) - - } else { # handle unordered factors - - levels = - adetail %>% - strHead(-1) %>% - strSplitSingle(",") %>% - stringr::str_trim() - - out = list(type = "factor", levels = levels) - } - - return(out) - } - - details_cleaned = - details %>% - purrr::map(handleDetail) %>% - purrr::transpose() - - details_cleaned[["type"]] = unlist(details_cleaned[["type"]]) - details_cleaned[["variable"]] = variables - - res = tidytable::as_tidytable(details_cleaned) - return(res) -} diff --git a/man/.DS_Store b/man/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 50", "var_2 < 30"), + RHS = c(2, 1) + ) +as_rulelist(rules_df, estimation_type = "regression") +} +\seealso{ +\link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, \link[=predict.rulelist]{predict} + +Other Core Rulelist Utility: +\code{\link{as_rulelist}()}, +\code{\link{predict.rulelist}()}, +\code{\link{print.rulelist}()}, +\code{\link{set_keys}()} +} +\concept{Core Rulelist Utility} diff --git a/man/augment.Rd b/man/augment.Rd new file mode 100644 index 0000000..c79eb74 --- /dev/null +++ b/man/augment.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/augment.R +\name{augment} +\alias{augment} +\title{\code{augment} is re-export of \link[generics:augment]{generics::augment} from +\link[=package_tidyrules]{tidyrules} package} +\usage{ +augment(x, ...) +} +\arguments{ +\item{x}{A \link{rulelist}} + +\item{...}{For methods to use} +} +\description{ +See \link{augment.rulelist} +} +\seealso{ +\link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, \link[=predict.rulelist]{predict} + +Other Augment: +\code{\link{augment.rulelist}()} +} +\concept{Augment} diff --git a/man/augment.rulelist.Rd b/man/augment.rulelist.Rd new file mode 100644 index 0000000..bf9ad13 --- /dev/null +++ b/man/augment.rulelist.Rd @@ -0,0 +1,176 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/augment.R +\name{augment.rulelist} +\alias{augment.rulelist} +\title{Augment a \link{rulelist}} +\usage{ +\method{augment}{rulelist}(x, new_data, y_name, weight = 1L, ...) +} +\arguments{ +\item{x}{A \link{rulelist}} + +\item{new_data}{(dataframe) with column named \code{y_name} present} + +\item{y_name}{(string) Column name representing the dependent variable} + +\item{weight}{(numeric, default: 1) Positive weight vector with length equal +to one or number of rows of 'new_data'} + +\item{...}{(expressions) To be send to \link[tidytable:summarize]{tidytable::summarise} for custom +aggregations. See examples.} +} +\value{ +A \link{rulelist} with a new dataframe-column named \code{augmented_stats}. +} +\description{ +\code{augment} outputs a \link{rulelist} with an additional column named +\code{augmented_stats} based on summary statistics calculated using \code{new_data}. +} +\details{ +The dataframe-column \code{augmented_stats} will have these columns +corresponding to the \code{estimation_type}: +\itemize{ +\item For \code{regression}: \code{support}, \code{IQR}, \code{RMSE} +\item For \code{classification}: \code{support}, \code{confidence}, \code{lift} +} + +All these metrics are computed in a weighted sense. Arg \code{weight} is 1 by +default. +} +\examples{ +# Examples for augment ------------------------------------------------------ +library("magrittr") + +# C5 ---- +att = modeldata::attrition +set.seed(100) +train_index = sample(c(TRUE, FALSE), nrow(att), replace = TRUE) + +model_c5 = C50::C5.0(Attrition ~., data = att[train_index, ], rules = TRUE) +tidy_c5 = tidy(model_c5) +tidy_c5 + +# augment +augmented = augment(tidy_c5, new_data = att[!train_index, ], y_name = "Attrition") + +augmented \%>\% + tidytable::unnest(augmented_stats, names_sep = "__") \%>\% + tidytable::glimpse() + +# augment with custom aggregator +augmented = + augment(tidy_c5, + new_data = att[!train_index, ], + y_name = "Attrition", + output_counts = list(table(Attrition)) + ) + +augmented \%>\% + tidytable::unnest(augmented_stats, names_sep = "__") \%>\% + tidytable::glimpse() + +# rpart ---- +set.seed(100) +train_index = sample(c(TRUE, FALSE), nrow(iris), replace = TRUE) + +model_class_rpart = rpart::rpart(Species ~ ., data = iris[train_index, ]) +tidy_class_rpart = tidy(model_class_rpart) +tidy_class_rpart + +model_regr_rpart = rpart::rpart(Sepal.Length ~ ., data = iris[train_index, ]) +tidy_regr_rpart = tidy(model_regr_rpart) +tidy_regr_rpart + +#' augment (classification case) +augmented = + augment(tidy_class_rpart, + new_data = iris[!train_index, ], + y_name = "Species" + ) +augmented + +augmented \%>\% + tidytable::unnest(augmented_stats, names_sep = "__") \%>\% + tidytable::glimpse() + +#' augment (regression case) +augmented = + augment(tidy_regr_rpart, + new_data = iris[!train_index, ], + y_name = "Sepal.Length" + ) +augmented + +augmented \%>\% + tidytable::unnest(augmented_stats, names_sep = "__") \%>\% + tidytable::glimpse() + +# party ---- +pen = palmerpenguins::penguins +set.seed(100) +train_index = sample(c(TRUE, FALSE), nrow(pen), replace = TRUE) + +model_class_party = partykit::ctree(species ~ ., data = pen[train_index, ]) +tidy_class_party = tidy(model_class_party) +tidy_class_party + +model_regr_party = partykit::ctree(bill_length_mm ~ ., data = pen[train_index, ]) +tidy_regr_party = tidy(model_regr_party) +tidy_regr_party + +#' augment (classification case) +augmented = + augment(tidy_class_party, + new_data = pen[!train_index, ], + y_name = "species" + ) +augmented + +augmented \%>\% + tidytable::unnest(augmented_stats, names_sep = "__") \%>\% + tidytable::glimpse() + +#' augment (regression case) +augmented = + augment(tidy_regr_party, + new_data = tidytable::drop_na(pen[!train_index, ], bill_length_mm), + y_name = "bill_length_mm" + ) +augmented + +augmented \%>\% + tidytable::unnest(augmented_stats, names_sep = "__") \%>\% + tidytable::glimpse() + +# cubist ---- +att = modeldata::attrition +set.seed(100) +train_index = sample(c(TRUE, FALSE), nrow(att), replace = TRUE) +cols_att = setdiff(colnames(att), c("MonthlyIncome", "Attrition")) + +model_cubist = Cubist::cubist(x = att[train_index, cols_att], + y = att[train_index, "MonthlyIncome"] + ) + +tidy_cubist = tidy(model_cubist) +tidy_cubist + +augmented = + augment(tidy_cubist, + new_data = att[!train_index, ], + y_name = "MonthlyIncome" + ) +augmented + +augmented \%>\% + tidytable::unnest(augmented_stats, names_sep = "__") \%>\% + tidytable::glimpse() + +} +\seealso{ +\link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, \link[=predict.rulelist]{predict} + +Other Augment: +\code{\link{augment}()} +} +\concept{Augment} diff --git a/man/augment_class_keys.Rd b/man/augment_class_keys.Rd new file mode 100644 index 0000000..715c90d --- /dev/null +++ b/man/augment_class_keys.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/augment.R +\name{augment_class_keys} +\alias{augment_class_keys} +\title{as the name says} +\usage{ +augment_class_keys(x, new_data, y_name, weight = 1L, ...) +} +\description{ +as the name says +not to be exported +} +\keyword{internal} diff --git a/man/augment_class_no_keys.Rd b/man/augment_class_no_keys.Rd new file mode 100644 index 0000000..628504d --- /dev/null +++ b/man/augment_class_no_keys.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/augment.R +\name{augment_class_no_keys} +\alias{augment_class_no_keys} +\title{as the name says} +\usage{ +augment_class_no_keys(x, new_data, y_name, weight = 1L, ...) +} +\description{ +as the name says +not to be exported +} +\keyword{internal} diff --git a/man/augment_regr_keys.Rd b/man/augment_regr_keys.Rd new file mode 100644 index 0000000..60d1463 --- /dev/null +++ b/man/augment_regr_keys.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/augment.R +\name{augment_regr_keys} +\alias{augment_regr_keys} +\title{as the name says} +\usage{ +augment_regr_keys(x, new_data, y_name, weight = 1L, ...) +} +\description{ +as the name says +not to be exported +} +\keyword{internal} diff --git a/man/augment_regr_no_keys.Rd b/man/augment_regr_no_keys.Rd new file mode 100644 index 0000000..b060b38 --- /dev/null +++ b/man/augment_regr_no_keys.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/augment.R +\name{augment_regr_no_keys} +\alias{augment_regr_no_keys} +\title{as the name says} +\usage{ +augment_regr_no_keys(x, new_data, y_name, weight = 1L, ...) +} +\description{ +as the name says +not to be exported +} +\keyword{internal} diff --git a/man/convert_rule_flavor.Rd b/man/convert_rule_flavor.Rd index 85b4be6..78f83f1 100644 --- a/man/convert_rule_flavor.Rd +++ b/man/convert_rule_flavor.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rule_translators.R +% Please edit documentation in R/utils.R \name{convert_rule_flavor} \alias{convert_rule_flavor} \title{Convert a R parsable rule to python/sql parsable rule} @@ -17,3 +17,10 @@ convert_rule_flavor(rule, flavor) \description{ Convert a R parsable rule to python/sql parsable rule } +\seealso{ +\link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, \link[=predict.rulelist]{predict}, \link{to_sql_case} + +Other Auxiliary Rulelist Utility: +\code{\link{to_sql_case}()} +} +\concept{Auxiliary Rulelist Utility} diff --git a/man/package_tidyrules.Rd b/man/package_tidyrules.Rd index b9aee74..a874e35 100644 --- a/man/package_tidyrules.Rd +++ b/man/package_tidyrules.Rd @@ -5,17 +5,19 @@ \alias{tidyrules} \alias{tidyrules-package} \alias{package_tidyrules} -\title{About 'tidyrules' package} +\title{\code{tidyrules}} \description{ -Obtain rules as tidy dataframes +\code{tidyrules} package provides a framework to work with decision +rules stored as a \link{rulelist} backed by a tidy dataframe. Rules can be +extracted from supported models using \link{tidy}, augmented using validation data +by \link[=augment.rulelist]{augment}, manipulated using standard dataframe +operations, (modified) rulelists can be used to \link[=predict.rulelist]{predict} +on unseen (test) data. Utilities include: Create a rulelist +manually (\link[=as_rulelist.data.frame]{as_rulelist}), Export a rulelist to SQL +(\link{to_sql_case}) and so on. } \seealso{ -Useful links: -\itemize{ - \item \url{https://github.com/talegari/tidyrules} - \item Report bugs at \url{https://github.com/talegari/tidyrules/issues} -} - +\link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, \link[=predict.rulelist]{predict} } \author{ \strong{Maintainer}: Srikanth Komala Sheshachala \email{sri.teach@gmail.com} diff --git a/man/positionSpaceOutsideSinglequotes.Rd b/man/positionSpaceOutsideSinglequotes.Rd index 83aa6e2..92ed3c6 100644 --- a/man/positionSpaceOutsideSinglequotes.Rd +++ b/man/positionSpaceOutsideSinglequotes.Rd @@ -14,7 +14,7 @@ A integer vector of positions } \description{ (vectorised) Detect the position of space in a string not within - a pair of single quotes +a pair of single quotes } \examples{ \donttest{ @@ -22,3 +22,4 @@ tidyrules:::positionSpaceOutsideSinglequotes(c("hello", "hel' 'o ")) } } +\keyword{internal} diff --git a/man/predict.rulelist.Rd b/man/predict.rulelist.Rd index fa818a0..5175313 100644 --- a/man/predict.rulelist.Rd +++ b/man/predict.rulelist.Rd @@ -1,40 +1,43 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ruleclasses.R +% Please edit documentation in R/rulelist.R \name{predict.rulelist} \alias{predict.rulelist} -\title{`predict` method for rulelist class} +\title{\code{predict} method for a \link{rulelist}} \usage{ -\method{predict}{rulelist}(object, new_data, raw = FALSE, ...) +\method{predict}{rulelist}(object, new_data, multiple = FALSE, ...) } \arguments{ -\item{object}{rulelist object} +\item{object}{A \link{rulelist}} -\item{new_data}{dataframe to predict} +\item{new_data}{(dataframe)} -\item{raw}{(flag, default: FALSE) Whether raw prediction are to be provided} +\item{multiple}{(flag, default: FALSE) Whether to output all rule numbers +applicable for a row. If FALSE, the first satisfying rule is provided.} \item{...}{unused} } \value{ -A dataframe indicating `rule_nbr` applicable for a `row_nbr` in - new_data +dataframe. See \strong{Details}. } \description{ -Returns the `rule_nbr` applicable for a `row_nbr` in new_data +Predicts \code{rule_nbr} applicable (as per the order in rulelist) +for a \code{row_nbr} (per key) in new_data } \details{ -If a `row_nbr` is covered more than one `rule_nbr` per 'keys', then -`rule_nbr` appearing in the earlier (as in row order) takes precedence. - -When raw is `FALSE`(default), output is a tidytable/dataframe with three or -more columns: `row_number` (int), columns corresponding to 'keys', `rule_nbr` -(int). If a row number is not covered by any rule, then there is one row with -all other columns other than `row_nbr` has a missing value. - -When raw is `TRUE`(default), output is a tidytable/dataframe with three or -more columns: `row_number` (int), columns corresponding to 'keys', `rule_nbr` -(list of intergers). If a row number is not covered by any rule, then there -is no row corresponding the `row_nbr`. +If a \code{row_nbr} is covered more than one \code{rule_nbr} per 'keys', then +\code{rule_nbr} appearing earlier (as in row order of the \link{rulelist}) takes +precedence. +\subsection{Output Format}{ +\itemize{ +\item When multiple is \code{FALSE}(default), output is a dataframe with three +or more columns: \code{row_number} (int), columns corresponding to 'keys', +\code{rule_nbr} (int). +\item When multiple is \code{TRUE}(default), output is a tidytable/dataframe with three +or more columns: \code{row_number} (int), columns corresponding to 'keys', +\code{rule_nbr} (list column of integers). +\item If a row number and 'keys' combination is not covered by any rule, then \code{rule_nbr} column has missing value. +} +} } \examples{ model_c5 = C50::C5.0(species ~., @@ -48,7 +51,16 @@ tidy_c5 output_1 = predict(tidy_c5, palmerpenguins::penguins) output_1 # different rules per 'keys' (`trial_nbr` here) -output_2 = predict(tidy_c5, palmerpenguins::penguins, raw = TRUE) +output_2 = predict(tidy_c5, palmerpenguins::penguins, multiple = TRUE) output_2 # `rule_nbr` is a list-column of integer vectors +} +\seealso{ +\link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, \link[=predict.rulelist]{predict} +Other Core Rulelist Utility: +\code{\link{as_rulelist}()}, +\code{\link{as_rulelist.data.frame}()}, +\code{\link{print.rulelist}()}, +\code{\link{set_keys}()} } +\concept{Core Rulelist Utility} diff --git a/man/predict.ruleset.Rd b/man/predict.ruleset.Rd deleted file mode 100644 index 65422c6..0000000 --- a/man/predict.ruleset.Rd +++ /dev/null @@ -1,51 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ruleclasses.R -\name{predict.ruleset} -\alias{predict.ruleset} -\title{`predict` method for ruleset class} -\usage{ -\method{predict}{ruleset}(object, new_data, raw = FALSE, ...) -} -\arguments{ -\item{object}{ruleset object} - -\item{new_data}{dataframe to predict} - -\item{raw}{(flag, default: FALSE) Whether raw prediction are to be provided} - -\item{...}{unused} -} -\value{ -A dataframe indicating `rule_nbr` applicable for a `row_nbr` in - new_data -} -\description{ -Returns the `rule_nbr` applicable for a `row_nbr` in new_data -} -\details{ -A `row_nbr` is covered more than one `rule_nbr` per 'keys', results in error. - -When raw is `FALSE`(default), output is a tidytable/dataframe with three or -more columns: `row_number` (int), columns corresponding to`keys`, `rule_nbr` -(int). If a row number is not covered by any rule, then there is one row with -all other columns other than `row_nbr` has a missing value. - -When raw is `TRUE`(default), output is a tidytable/dataframe with three or -more columns: `row_number` (int), columns corresponding to`keys`, `rule_nbr` -(list of integers). If a row number is not covered by any rule, then there is -no row corresponding the `row_nbr`. -} -\examples{ -model_rpart = rpart::rpart(species ~ ., - data = palmerpenguins::penguins - ) -tidy_rpart = tidy(model_rpart) -tidy_rpart - -output_1 = predict(tidy_rpart, palmerpenguins::penguins) -output_1 - -output_2 = predict(tidy_rpart, palmerpenguins::penguins, raw = TRUE) -output_2 # `rule_nbr` is a list-column of integer vectors - -} diff --git a/man/predict_all_nokeys_rulelist.Rd b/man/predict_all_nokeys_rulelist.Rd new file mode 100644 index 0000000..e3cf8a9 --- /dev/null +++ b/man/predict_all_nokeys_rulelist.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rulelist.R +\name{predict_all_nokeys_rulelist} +\alias{predict_all_nokeys_rulelist} +\title{as the name says} +\usage{ +predict_all_nokeys_rulelist(rulelist, new_data) +} +\arguments{ +\item{rulelist}{rulelist} + +\item{new_data}{new_data} +} +\value{ +dataframe +} +\description{ +as the name says +} +\keyword{internal} diff --git a/man/predict_all_rulelist.Rd b/man/predict_all_rulelist.Rd new file mode 100644 index 0000000..8e57ab4 --- /dev/null +++ b/man/predict_all_rulelist.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rulelist.R +\name{predict_all_rulelist} +\alias{predict_all_rulelist} +\title{with or without keys} +\usage{ +predict_all_rulelist(rulelist, new_data) +} +\arguments{ +\item{rulelist}{rulelist} + +\item{new_data}{new_data} +} +\value{ +dataframe +} +\description{ +uses predict_all_nokeys_rulelist +} +\keyword{internal} diff --git a/man/predict_core.Rd b/man/predict_core.Rd deleted file mode 100644 index 97e7b0c..0000000 --- a/man/predict_core.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ruleclasses.R -\name{predict_core} -\alias{predict_core} -\title{Core predict for ruleset/list/tidy set of rules} -\usage{ -predict_core(rules_df, new_data) -} -\arguments{ -\item{rules_df}{dataframe with at least two columns: `rule_nbr`, `LHS`. -Should have 'keys' columns such that `rule_nbr` along with 'keys' columns -form a unique combo per row} - -\item{new_data}{Data to predict on} -} -\value{ -dataframe with these columns: `row_nbr` (integer), 'keys' columns, - `rule_nbr` (list of integers) -} -\description{ -Core logic of predict method is written in a generic sense. This - function will not be exposed at user level. -} -\details{ -If a row number is not covered under any rule, then it does not - appear as a row in the output. -} -\examples{ -\dontrun{ -library("magrittr") - -# ruleset case -rpart::rpart(Species ~ .,data = iris) \%>\% - tidy() \%>\% - dplyr::select(rule_nbr, LHS) \%>\% - predict_core(iris) - -# rulelist case -C50::C5.0(species ~., - data = palmerpenguins::penguins, - trials = 5, - rules = TRUE - ) \%>\% - tidy() \%>\% - dplyr::select(rule_nbr, trial_nbr, LHS) \%>\% - predict_core(palmerpenguins::penguins) -} -} diff --git a/man/predict_nokeys_rulelist.Rd b/man/predict_nokeys_rulelist.Rd new file mode 100644 index 0000000..c954f78 --- /dev/null +++ b/man/predict_nokeys_rulelist.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rulelist.R +\name{predict_nokeys_rulelist} +\alias{predict_nokeys_rulelist} +\title{as the name says} +\usage{ +predict_nokeys_rulelist(rulelist, new_data) +} +\arguments{ +\item{rulelist}{rulelist} + +\item{new_data}{new_data} +} +\value{ +dataframe +} +\description{ +as the name says +} +\keyword{internal} diff --git a/man/predict_rulelist.Rd b/man/predict_rulelist.Rd new file mode 100644 index 0000000..f1c6ed2 --- /dev/null +++ b/man/predict_rulelist.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rulelist.R +\name{predict_rulelist} +\alias{predict_rulelist} +\title{with or without keys} +\usage{ +predict_rulelist(rulelist, new_data) +} +\arguments{ +\item{rulelist}{rulelist} + +\item{new_data}{new_data} +} +\value{ +dataframe +} +\description{ +uses predict_nokeys_rulelist +} +\keyword{internal} diff --git a/man/print.rulelist.Rd b/man/print.rulelist.Rd index e7702f0..2718106 100644 --- a/man/print.rulelist.Rd +++ b/man/print.rulelist.Rd @@ -1,19 +1,29 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ruleclasses.R +% Please edit documentation in R/rulelist.R \name{print.rulelist} \alias{print.rulelist} -\title{Print method for rulelist class} +\title{Print method for \link{rulelist} class} \usage{ \method{print}{rulelist}(x, ...) } \arguments{ -\item{x}{A rulelist object} +\item{x}{A \link{rulelist} object} -\item{...}{Passed to `tidytable::print`} +\item{...}{Passed to \code{tidytable::print}} } \value{ -Input (invisibly) +input \link{rulelist} (invisibly) } \description{ -Prints 'keys' and rulelist as a tidytable +Prints \link{rulelist} attributes and first few rows. } +\seealso{ +\link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, \link[=predict.rulelist]{predict} + +Other Core Rulelist Utility: +\code{\link{as_rulelist}()}, +\code{\link{as_rulelist.data.frame}()}, +\code{\link{predict.rulelist}()}, +\code{\link{set_keys}()} +} +\concept{Core Rulelist Utility} diff --git a/man/print.ruleset.Rd b/man/print.ruleset.Rd deleted file mode 100644 index 3708b60..0000000 --- a/man/print.ruleset.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ruleclasses.R -\name{print.ruleset} -\alias{print.ruleset} -\title{Print method for ruleset class} -\usage{ -\method{print}{ruleset}(x, ...) -} -\arguments{ -\item{x}{A ruleset object} - -\item{...}{Passed to `tidytable::print`} -} -\value{ -Input (invisibly) -} -\description{ -Prints 'keys' and ruleset as a tidytable -} diff --git a/man/reexports.Rd b/man/reexports.Rd deleted file mode 100644 index 9b6c624..0000000 --- a/man/reexports.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/generic.R -\docType{import} -\name{reexports} -\alias{reexports} -\alias{tidy} -\title{Objects exported from other packages} -\keyword{internal} -\description{ -These objects are imported from other packages. Follow the links -below to see their documentation. - -\describe{ - \item{generics}{\code{\link[generics]{tidy}}} -}} - diff --git a/man/removeEmptyLines.Rd b/man/removeEmptyLines.Rd index 2118bcc..43e220e 100644 --- a/man/removeEmptyLines.Rd +++ b/man/removeEmptyLines.Rd @@ -21,3 +21,4 @@ tidyrules:::removeEmptyLines(c("abc", "", "d")) } } +\keyword{internal} diff --git a/man/rulelist.Rd b/man/rulelist.Rd new file mode 100644 index 0000000..f3516c6 --- /dev/null +++ b/man/rulelist.Rd @@ -0,0 +1,90 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rulelist.R +\name{rulelist} +\alias{rulelist} +\title{Rulelist} +\description{ +\subsection{Structure}{ + +A \code{rulelist} is ordered list of rules stored as a dataframe. Each row, +specifies a rule (LHS), expected outcome (RHS) and some other details. + +It has these mandatory columns: +\itemize{ +\item \code{rule_nbr}: (integer vector) Rule number +\item \code{LHS}: (character vector) A rule is a string that can be parsed using \code{\link[base:parse]{base::parse()}} +\item \code{RHS}: (character vector or a literal) +} +} + +\subsection{Example}{ + +\if{html}{\out{
}}\preformatted{| rule_nbr|LHS |RHS | support| confidence| lift| +|--------:|:--------------------------------------------------------------------|:---------|-------:|----------:|--------:| +| 1|( island \%in\% c('Biscoe') ) & ( flipper_length_mm > 203 ) |Gentoo | 122| 1.0000000| 2.774193| +| 2|( island \%in\% c('Biscoe') ) & ( flipper_length_mm <= 203 ) |Adelie | 46| 0.9565217| 2.164760| +| 3|( island \%in\% c('Dream', 'Torgersen') ) & ( bill_length_mm > 44.1 ) |Chinstrap | 65| 0.9538462| 4.825339| +| 4|( island \%in\% c('Dream', 'Torgersen') ) & ( bill_length_mm <= 44.1 ) |Adelie | 111| 0.9459459| 2.140825| +}\if{html}{\out{
}} +} + +\subsection{Create a rulelist}{ + +A \code{rulelist} can be created using \code{\link[=tidy]{tidy()}} on some supported model fits +(run: \code{utils::methods(tidy)}). It can also be created manually from a +existing dataframe using \link[=as_rulelist.data.frame]{as_rulelist}. +} + +\subsection{Keys and attributes}{ + +Columns identified as 'keys' along with \code{rule_nbr} form a unique +combination +-- a group of rules. For example, rule-based C5 model with multiple trials +creates rules per each \code{trial_nbr}. \code{predict} method understands 'keys', +thereby provides/predicts a rule number (for each row in new data / test +data) within the same \code{trial_nbr}. + +A rulelist has these mandatory attributes: +\itemize{ +\item \code{estimation_type}: One among \code{regression}, \code{classification} + +A rulelist has these optional attributes: +\item \code{keys}: (character vector)Names of the column that forms a key. +\item \code{model_type}: (string) Name of the model +} +} + +\subsection{Methods for rulelist}{ +\enumerate{ +\item \link[=predict.rulelist]{Predict}: Given a dataframe (possibly without a +dependent variable column aka 'test data'), predicts the first rule (as +ordered in the rulelist) per 'keys' that is applicable for each row. When +\code{multiple = TRUE}, returns all rules applicable for a row (per key). +\item \link[=augment.rulelist]{Augment}: Given a dataframe (with dependent variable +column, aka validation data), creates summary statistics per rule and +returns a rulelist with a new dataframe-column. +} +} + +\subsection{Manipulating a rulelist}{ + +Rulelists are essentially dataframes. Hence, any dataframe operations which +preferably preserve attributes will output a rulelist. \link{as_rulelist} and +\link{as.data.frame} will help in moving back and forth between rulelist and +dataframe worlds. +} + +\subsection{Utilities for a rulelist}{ +\enumerate{ +\item \link[=as_rulelist.data.frame]{as_rulelist}: Create a \code{rulelist} from a +dataframe with some mandatory columns. 2. \link{set_keys}: Set or Unset 'keys' +of a \code{rulelist}. 3. \link{to_sql_case}: Outputs a SQL case statement for a +\code{rulelist}. 4. \link{convert_rule_flavor}: Converts \code{R}-parsable rule strings to +python/SQL parsable rule strings. +} +} +} +\seealso{ +\link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, +\link[=predict.rulelist]{predict} +} diff --git a/man/set_keys.Rd b/man/set_keys.Rd new file mode 100644 index 0000000..d6c535b --- /dev/null +++ b/man/set_keys.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rulelist.R +\name{set_keys} +\alias{set_keys} +\title{Set keys for a \link{rulelist}} +\usage{ +set_keys(x, keys) +} +\arguments{ +\item{x}{A \link{rulelist}} + +\item{keys}{(character vector or NULL)} +} +\value{ +A \link{rulelist} object +} +\description{ +'keys' are a set of column(s) whose unique combination +identifies a group of rules in a \link{rulelist}. Methods like +\link{predict.rulelist}, \link{augment.rulelist} produce output per key combination. +} +\details{ +A new \link{rulelist} is returned with attr \code{keys} is modified. The input +\link{rulelist} object is unaltered. +} +\examples{ +model_c5 = C50::C5.0(Attrition ~., data = modeldata::attrition, rules = TRUE) +tidy_c5 = tidy(model_c5) +tidy_c5 # keys are: "trial_nbr" + +new_tidy_c5 = set_keys(tidy_c5, NULL) # remove all keys +new_tidy_c5 +new_2_tidy_c5 = set_keys(new_tidy_c5, "trial_nbr") # set "trial_nbr" as key +new_2_tidy_c5 + +# Note that `tidy_c5` and `new_tidy_c5` are not altered. +tidy_c5 +new_tidy_c5 +} +\seealso{ +\link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, \link[=predict.rulelist]{predict} + +Other Core Rulelist Utility: +\code{\link{as_rulelist}()}, +\code{\link{as_rulelist.data.frame}()}, +\code{\link{predict.rulelist}()}, +\code{\link{print.rulelist}()} +} +\concept{Core Rulelist Utility} diff --git a/man/strHead.Rd b/man/strHead.Rd index 10e660e..1a7e9dd 100644 --- a/man/strHead.Rd +++ b/man/strHead.Rd @@ -27,3 +27,4 @@ tidyrules:::strHead(c("string", "string2"), -1) } } +\keyword{internal} diff --git a/man/strReplaceReduce.Rd b/man/strReplaceReduce.Rd index 9df9b6b..9e50458 100644 --- a/man/strReplaceReduce.Rd +++ b/man/strReplaceReduce.Rd @@ -25,3 +25,4 @@ tidyrules:::strReplaceReduce("abcd", c("ab", "dc"), c("cd", "ab")) } } +\keyword{internal} diff --git a/man/strSplitSingle.Rd b/man/strSplitSingle.Rd index ba9f1cf..833f2b8 100644 --- a/man/strSplitSingle.Rd +++ b/man/strSplitSingle.Rd @@ -23,3 +23,4 @@ tidyrules:::strSplitSingle("abc,d", ",") } } +\keyword{internal} diff --git a/man/strTail.Rd b/man/strTail.Rd index 76cbeca..3422fcf 100644 --- a/man/strTail.Rd +++ b/man/strTail.Rd @@ -27,3 +27,4 @@ tidyrules:::strTail(c("string", "string2"), -1) } } +\keyword{internal} diff --git a/man/tidy.C5.0.Rd b/man/tidy.C5.0.Rd index dfdc3a5..8a18370 100644 --- a/man/tidy.C5.0.Rd +++ b/man/tidy.C5.0.Rd @@ -1,35 +1,50 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/c5.R +% Please edit documentation in R/tidy.R \name{tidy.C5.0} \alias{tidy.C5.0} -\title{Obtain rules as rulelist/tiydtable from a C5.0 model} +\title{Get the \link{rulelist} from a \link[C50:C5.0]{C5} model} \usage{ \method{tidy}{C5.0}(x, ...) } \arguments{ -\item{x}{C5 model fitted with `rules = TRUE`} +\item{x}{\link[C50:C5.0]{C50::C5.0} model fitted with \code{rules = TRUE}} \item{...}{Other arguments (See details)} } \value{ -A rulelist/tidytable where each row corresponds to a rule. - The columns are: rule_nbr, trial_nbr, LHS, RHS, support, confidence, lift +A \link{rulelist} object } \description{ -Each row corresponds to a rule per trial_nbr +Each row corresponds to a rule per \code{trial_nbr} } \details{ -Optional named arguments: - \itemize{ +\item The output columns are: \code{rule_nbr}, \code{trial_nbr}, \code{LHS}, \code{RHS}, +\code{support}, \code{confidence}, \code{lift}. +\item Rules per \code{trial_nbr} are sorted in this order: \code{desc(confidence)}, +\code{desc(lift)}, \code{desc(support)}. +} -\item laplace(flag, default: TRUE) is supported. This computes confidence -with laplace correction as documented under 'Rulesets' here: [C5 -doc](https://www.rulequest.com/see5-unix.html). +Optional named arguments: +\itemize{ +\item \code{laplace} (flag, default: TRUE) is supported. This +computes confidence with laplace correction as documented under 'Rulesets' +here: \href{https://www.rulequest.com/see5-unix.html}{C5 doc}. } } \examples{ -c5_model = C50::C5.0(Attrition ~., data = modeldata::attrition, rules = TRUE) -summary(c5_model) -tidy(c5_model) +model_c5 = C50::C5.0(Attrition ~., data = modeldata::attrition, rules = TRUE) +tidy(model_c5) + +} +\seealso{ +\link{tidy}, \link{tidy.C5.0}, \link{tidy.rpart}, \link{tidy.constparty}, \link{tidy.cubist}, +\link{rulelist}, \link{augment.rulelist}, \link{predict.rulelist} + +Other Core Tidy Utility: +\code{\link{tidy}()}, +\code{\link{tidy.constparty}()}, +\code{\link{tidy.cubist}()}, +\code{\link{tidy.rpart}()} } +\concept{Core Tidy Utility} diff --git a/man/tidy.Rd b/man/tidy.Rd new file mode 100644 index 0000000..616bb1e --- /dev/null +++ b/man/tidy.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tidy.R +\name{tidy} +\alias{tidy} +\title{\code{tidy} is re-export of \link[generics:tidy]{generics::tidy} from +\link[=package_tidyrules]{tidyrules} package} +\usage{ +tidy(x, ...) +} +\arguments{ +\item{x}{A supported model object} + +\item{...}{For model specific implementations to use} +} +\description{ +\code{tidy} applied on a supported model fit creates a \link{rulelist}. +\strong{See Also} section links to documentation of specific methods. +} +\seealso{ +\link{tidy}, \link{tidy.C5.0}, \link{tidy.rpart}, \link{tidy.constparty}, \link{tidy.cubist}, +\link{rulelist}, \link[=augment.rulelist]{augment}, \link[=predict.rulelist]{predict} + +Other Core Tidy Utility: +\code{\link{tidy.C5.0}()}, +\code{\link{tidy.constparty}()}, +\code{\link{tidy.cubist}()}, +\code{\link{tidy.rpart}()} +} +\concept{Core Tidy Utility} diff --git a/man/tidy.constparty.Rd b/man/tidy.constparty.Rd index 75ee7de..3938991 100644 --- a/man/tidy.constparty.Rd +++ b/man/tidy.constparty.Rd @@ -1,37 +1,54 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/party.R +% Please edit documentation in R/tidy.R \name{tidy.constparty} \alias{tidy.constparty} -\title{Obtain rules as a ruleset/tidytable from a party model} +\title{Get the \link{rulelist} from a \link[partykit:party]{party} model} \usage{ \method{tidy}{constparty}(x, ...) } \arguments{ -\item{x}{party model} +\item{x}{\link[partykit:party]{partykit::party} model typically built using \link[partykit:ctree]{partykit::ctree}} \item{...}{Other arguments (currently unused)} } \value{ -A tidytable where each row corresponds to a rule. The columns are: - rule_nbr, LHS, RHS, support, confidence (for classification only), lift - (for classification only) +A \link{rulelist} object } \description{ -Each row corresponds to a rule. A rule can be copied into - `dplyr::filter` to filter the observations corresponding to a rule +Each row corresponds to a rule } \details{ -These party models are supported: regression (y is numeric), - classification (y is factor) +These types of \link[partykit:party]{party} models are supported: +\code{regression} (y is numeric), \code{classification} (y is factor) + +For \link[partykit:party]{party} classification model: +\itemize{ +\item Output columns are: \code{rule_nbr}, \code{LHS}, \code{RHS}, \code{support}, \code{confidence}, \code{lift}, \code{terminal_node_id}. +\item Rules are sorted in this order: \code{desc(confidence)}, \code{desc(lift)}, +\code{desc(support)}. +} + +For \link[partykit:party]{party} regression model: +\itemize{ +\item Output columns are: \code{rule_nbr}, \code{LHS}, \code{RHS}, \code{support}, \code{IQR}, \code{RMSE}, \code{terminal_node_id}. +\item Rules are sorted in this order: \code{RMSE}, \code{desc(support)}. +} } \examples{ -model_party_cl = partykit::ctree(species ~ .,data = palmerpenguins::penguins) -model_party_cl -tidy(model_party_cl) +pen = palmerpenguins::penguins +model_class_party = partykit::ctree(species ~ ., data = pen) +tidy(model_class_party) +model_regr_party = partykit::ctree(bill_length_mm ~ ., data = pen) +tidy(model_regr_party) +} +\seealso{ +\link{tidy}, \link{tidy.C5.0}, \link{tidy.rpart}, \link{tidy.constparty}, \link{tidy.cubist}, +\link{rulelist}, \link{augment.rulelist}, \link{predict.rulelist} -model_party_re = partykit::ctree(bill_length_mm ~ ., - data = palmerpenguins::penguins - ) -model_party_re -tidy(model_party_re) +Other Core Tidy Utility: +\code{\link{tidy}()}, +\code{\link{tidy.C5.0}()}, +\code{\link{tidy.cubist}()}, +\code{\link{tidy.rpart}()} } +\concept{Core Tidy Utility} diff --git a/man/tidy.cubist.Rd b/man/tidy.cubist.Rd index 48eb23e..0f68970 100644 --- a/man/tidy.cubist.Rd +++ b/man/tidy.cubist.Rd @@ -1,35 +1,45 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cubist.R +% Please edit documentation in R/tidy.R \name{tidy.cubist} \alias{tidy.cubist} -\title{Obtain rules as a ruleset/tidytable from a cubist model} +\title{Get the \link{rulelist} from a \link[Cubist:cubist.default]{cubist} model} \usage{ \method{tidy}{cubist}(x, ...) } \arguments{ -\item{x}{Cubist model} +\item{x}{\link[Cubist:cubist.default]{Cubist::cubist} model} \item{...}{Other arguments (currently unused)} } \value{ -A ruleset/tidytable where each row corresponds to a rule. The columns - are: rule_nbr, committee, LHS, RHS, support, mean, min, max, error +A \link{rulelist} object } \description{ -Each row corresponds to a rule per committee. +Each row corresponds to a rule per \code{committee} } \details{ -When col_classes argument is missing, an educated guess is made - about class by parsing the RHS of sub-rule. This might sometimes not lead - to a parsable rule. +\itemize{ +\item The output columns are: \code{rule_nbr}, \code{committee}, \code{LHS}, \code{RHS}, \code{support}, \code{mean}, \code{min}, \code{max}, \code{error}. +\item Rules are sorted in this order per committee: +\code{error}, \code{desc(support)} +} } \examples{ -data("attrition", package = "modeldata") -cols_att = setdiff(colnames(attrition), c("MonthlyIncome", "Attrition")) +att = modeldata::attrition +cols_att = setdiff(colnames(att), c("MonthlyIncome", "Attrition")) +model_cubist = Cubist::cubist(x = att[, cols_att], + y = att[["MonthlyIncome"]] + ) +tidy(model_cubist) +} +\seealso{ +\link{tidy}, \link{tidy.C5.0}, \link{tidy.rpart}, \link{tidy.constparty}, \link{tidy.cubist}, +\link{rulelist}, \link{augment.rulelist}, \link{predict.rulelist} -cb_att = Cubist::cubist(x = attrition[, cols_att], - y = attrition[["MonthlyIncome"]] - ) -summary(cb_att) -tidy(cb_att) +Other Core Tidy Utility: +\code{\link{tidy}()}, +\code{\link{tidy.C5.0}()}, +\code{\link{tidy.constparty}()}, +\code{\link{tidy.rpart}()} } +\concept{Core Tidy Utility} diff --git a/man/tidy.rpart.Rd b/man/tidy.rpart.Rd index c2b1011..9036f3c 100644 --- a/man/tidy.rpart.Rd +++ b/man/tidy.rpart.Rd @@ -1,36 +1,53 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rpart.R +% Please edit documentation in R/tidy.R \name{tidy.rpart} \alias{tidy.rpart} -\title{Obtain rules as a ruleset/tidytable from a rpart model} +\title{Get the \link{rulelist} from a \link[rpart:rpart]{rpart} model} \usage{ \method{tidy}{rpart}(x, ...) } \arguments{ -\item{x}{rpart model} +\item{x}{\link[rpart:rpart]{rpart::rpart} model} \item{...}{Other arguments (currently unused)} } \value{ -A tidytable where each row corresponds to a rule. The columns are: - rule_nbr, LHS, RHS, support, confidence (for classification only), lift - (for classification only) +A \link{rulelist} object } \description{ -Each row corresponds to a rule. A rule can be copied into - `dplyr::filter` to filter the observations corresponding to a rule +Each row corresponds to a rule } \details{ -NOTE: For rpart rules, one should build the model without -\bold{ordered factor} variable. We recommend you to convert \bold{ordered -factor} to \bold{factor} or \bold{integer} class. +For rpart rules, one should build the model without \link[base:factor]{ordered factor} variable. We recommend you to convert \link[base:factor]{ordered factor} to \link[base:factor]{factor} or \emph{integer} class. + +For \link[rpart:rpart]{rpart::rpart} classification model: +\itemize{ +\item Output columns are: \code{rule_nbr}, \code{LHS}, \code{RHS}, \code{support}, \code{confidence}, \code{lift}. +\item The rules are sorted in this order: \code{desc(confidence)}, \code{desc(lift)}, +\code{desc(support)}. +} + +For \link[rpart:rpart]{rpart::rpart} regression(anova) model: +\itemize{ +\item Output columns are: \code{rule_nbr}, \code{LHS}, \code{RHS}, \code{support}. +\item The rules are sorted in this order: \code{desc(support)}. +} } \examples{ -rpart_class = rpart::rpart(Species ~ .,data = iris) -rpart_class -tidy(rpart_class) +model_class_rpart = rpart::rpart(Species ~ ., data = iris) +tidy(model_class_rpart) + +model_regr_rpart = rpart::rpart(Sepal.Length ~ ., data = iris) +tidy(model_regr_rpart) +} +\seealso{ +\link{tidy}, \link{tidy.C5.0}, \link{tidy.rpart}, \link{tidy.constparty}, \link{tidy.cubist}, +\link{rulelist}, \link{augment.rulelist}, \link{predict.rulelist} -rpart_regr = rpart::rpart(Sepal.Length ~ .,data = iris) -rpart_regr -tidy(rpart_regr) +Other Core Tidy Utility: +\code{\link{tidy}()}, +\code{\link{tidy.C5.0}()}, +\code{\link{tidy.constparty}()}, +\code{\link{tidy.cubist}()} } +\concept{Core Tidy Utility} diff --git a/man/to_sql_case.Rd b/man/to_sql_case.Rd new file mode 100644 index 0000000..efd2ab4 --- /dev/null +++ b/man/to_sql_case.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{to_sql_case} +\alias{to_sql_case} +\title{Extract SQL case statement from a \link{rulelist}} +\usage{ +to_sql_case(x, rhs_column_name = "RHS", output_colname = "output") +} +\arguments{ +\item{x}{A \link{rulelist} object} + +\item{rhs_column_name}{(string, default: "RHS") Name of the column in the +rulelist to be used as RHS (WHEN \if{html}{\out{}} THEN {rhs}) in the sql case +statement} + +\item{output_colname}{(string, default: "output") Name of the output column +created by the SQL statement (used in case ... AS {output_column})} +} +\value{ +(string invisibly) SQL case statement +} +\description{ +Extract SQL case statement from a \link{rulelist} +} +\details{ +As a side-effect, the SQL statement is cat to stdout. The output +contains newline character. +} +\examples{ +model_c5 = C50::C5.0(Attrition ~., data = modeldata::attrition, rules = TRUE) +tidy(model_c5) +to_sql_case(tidy(model_c5)) +} +\seealso{ +\link{rulelist}, \link{tidy}, \link[=augment.rulelist]{augment}, \link[=predict.rulelist]{predict}, \link{convert_rule_flavor} + +Other Auxiliary Rulelist Utility: +\code{\link{convert_rule_flavor}()} +} +\concept{Auxiliary Rulelist Utility} diff --git a/man/varSpec.Rd b/man/varSpec.Rd index 85e3870..d199b41 100644 --- a/man/varSpec.Rd +++ b/man/varSpec.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/varSpec.R +% Please edit documentation in R/utils.R \name{varSpec} \alias{varSpec} \title{Get variable specification for a Cubist/C5 object} @@ -11,11 +11,11 @@ varSpec(object) } \value{ A tidytable with three columns: variable(character), type(character) - and levels(a list-column). For numeric variables, levels are set to NA. +and levels(a list-column). For numeric variables, levels are set to NA. } \description{ Obtain variable names, type (numeric, ordered, factor) and - levels as a tidytable +levels as a tidytable } \examples{ \dontrun{ @@ -28,3 +28,4 @@ cb_att = Cubist::cubist(x = attrition[, cols_att], varSpec(cb_att) } } +\keyword{internal} diff --git a/tests/testthat/test-c5.R b/tests/testthat/test-c5.R index 3e3cafc..2bd630c 100644 --- a/tests/testthat/test-c5.R +++ b/tests/testthat/test-c5.R @@ -47,7 +47,7 @@ allRulesFilterable = function(tr, data){ , silent = TRUE ) if(nrow(trydf) == 0){ - print(arule) + # print(arule) } inherits(trydf, "data.frame") } diff --git a/tests/testthat/test-cubist.R b/tests/testthat/test-cubist.R index 3f60e3f..dfdfe89 100644 --- a/tests/testthat/test-cubist.R +++ b/tests/testthat/test-cubist.R @@ -55,7 +55,7 @@ allRulesFilterable = function(tr, data){ , silent = TRUE ) if(nrow(trydf) == 0){ - print(arule) + #print(arule) } inherits(trydf, "data.frame") && (nrow(trydf) > 0) } @@ -79,17 +79,17 @@ evalRHS = function(tr, data){ , USE.NAMES = FALSE ) - print(which(!with_RHS)) + # print(which(!with_RHS)) return(all(with_RHS)) } # test output type ---- test_that("creates tibble", { - expect_is(tr_att, "ruleset") - expect_is(tr_att_2, "ruleset") - expect_is(tr_ames, "ruleset") - expect_is(tr_boston, "ruleset") + expect_is(tr_att, "rulelist") + expect_is(tr_att_2, "rulelist") + expect_is(tr_ames, "rulelist") + expect_is(tr_boston, "rulelist") }) # test NA ---- diff --git a/tests/testthat/test-party.R b/tests/testthat/test-party.R index 97e57e5..1020870 100644 --- a/tests/testthat/test-party.R +++ b/tests/testthat/test-party.R @@ -38,9 +38,9 @@ allRulesFilterable = function(tr, data){ # test output type ---- -test_that("creates ruleset", { - expect_is(tidy(model_party_cl), "ruleset") - expect_is(tidy(model_party_re), "ruleset") +test_that("creates rulelist", { + expect_is(tidy(model_party_cl), "rulelist") + expect_is(tidy(model_party_re), "rulelist") }) # test parsable ---- diff --git a/tests/testthat/test-rpart.R b/tests/testthat/test-rpart.R index 1195cba..deb75bf 100644 --- a/tests/testthat/test-rpart.R +++ b/tests/testthat/test-rpart.R @@ -74,11 +74,11 @@ test_that("check error",{ # test output type ---- -test_that("creates ruleset", { - expect_is(tr_att_class, "ruleset") - expect_is(tr_bc_1, "ruleset") - expect_is(tr_bc_2, "ruleset") - expect_is(tr_att_reg, "ruleset") +test_that("creates rulelist", { + expect_is(tr_att_class, "rulelist") + expect_is(tr_bc_1, "rulelist") + expect_is(tr_bc_2, "rulelist") + expect_is(tr_att_reg, "rulelist") }) # test NA ---- diff --git a/tests/testthat/test-ruleset.R b/tests/testthat/test-ruleset.R deleted file mode 100644 index 3b9b722..0000000 --- a/tests/testthat/test-ruleset.R +++ /dev/null @@ -1,27 +0,0 @@ -################################################################################ -# This is the part of the 'tidyrules' R package hosted at -# https://github.com/talegari/tidyrules with GPL-3 license. -################################################################################ - -context("test-ruleset") - -model_rpart = rpart::rpart(species ~ ., - data = palmerpenguins::penguins - ) -tidy_rpart = tidy(model_rpart) -tidy_rpart - -output_1 = predict(tidy_rpart, palmerpenguins::penguins) -output_1 - -output_2 = predict(tidy_rpart, palmerpenguins::penguins, raw = TRUE) -output_2 # `rule_nbr` is a list-column of integer vectors - -test_that("creates a dataframe", { - expect_is(output_1, "data.frame") - expect_is(output_2, "data.frame") -}) - -test_that("should not miss any row_nbr", { - expect_true(all(1:nrow(palmerpenguins::penguins) %in% output_1$row_nbr)) -}) \ No newline at end of file