From eeb315f44ed76983f5a46e21a324a688700e9c6f Mon Sep 17 00:00:00 2001 From: blas Date: Thu, 7 Dec 2023 20:45:27 +0100 Subject: [PATCH] version 1.1.1 --- CRAN-SUBMISSION | 6 +- DESCRIPTION | 2 +- NEWS.md | 4 + R/vif_df.R | 27 ++++-- R/vif_select.R | 10 ++ README.md | 4 +- cran-comments.md | 161 +++++++++++++++++++++++++++++-- dev_scripts/noLD_check.R | 8 ++ man/vif_df.Rd | 13 ++- man/vif_select.Rd | 10 ++ tests/testthat/test-vif_df.R | 9 +- tests/testthat/test-vif_select.R | 9 ++ 12 files changed, 237 insertions(+), 26 deletions(-) create mode 100644 dev_scripts/noLD_check.R diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index afc6feb..65b1350 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ -Version: 1.1.0 -Date: 2023-11-30 11:27:54 UTC -SHA: 2d1acfb0ed23f8362bafc4b8951a479173bb2fd1 +Version: 1.1.1 +Date: 2023-12-07 19:44:41 UTC +SHA: eefe4588d1b8b8614a681efef40f75f57bc08f91 diff --git a/DESCRIPTION b/DESCRIPTION index a484cc9..f6ae907 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: collinear Title: Seamless Multicollinearity Management -Version: 1.1.0 +Version: 1.1.1 Authors@R: person(given = "Blas M.", family = "Benito", , diff --git a/NEWS.md b/NEWS.md index e5d1f98..c1d2904 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# collinear 1.1.1 + +Hotfix of issue with solve(tol = 0) in systems with no large double support (noLD). This one wasn't fun. + # collinear 1.1.0 Added argument "smoothing" to `target_encoding_mean()` function to implement original target encoding method. diff --git a/R/vif_df.R b/R/vif_df.R index 8370230..f34869a 100644 --- a/R/vif_df.R +++ b/R/vif_df.R @@ -4,13 +4,14 @@ #' #' Computes the Variance Inflation Factor of all variables in a training data frame. #' +#' Warning: predictors with perfect correlation might cause errors, please use [cor_select()] to remove perfect correlations first. 
+#' #' The Variance Inflation Factor for a given variable `y` is computed as `1/(1-R2)`, where `R2` is the multiple R-squared of a multiple regression model fitted using `y` as response and all the remaining variables of the input data set as predictors. The equation can be interpreted as "the rate of perfect model's R-squared to the unexplained variance of this model". #' #' The possible range of VIF values is (1, Inf]. A VIF lower than 10 suggest that removing `y` from the data set would reduce overall multicollinearity. #' -#' This function computes the Variance Inflation Factor (VIF) in three steps: +#' This function computes the Variance Inflation Factor (VIF) in two steps: #' \itemize{ -#' \item Computes the correlation matrix between all pairs of predictors using `\link[stats]{cor}`. #' \item Applies `\link[base]{solve}` to obtain the precision matrix, which is the inverse of the covariance matrix. #' \item Uses `\link[base]{diag}` to extract the diagonal of the precision matrix, which contains the variance of the prediction of each predictor from all other predictors. 
#' } @@ -31,6 +32,14 @@ #' #subset to limit example run time #' vi <- vi[1:1000, ] #' +#' #reduce correlation in predictors with cor_select() +#' vi_predictors <- cor_select( +#' df = vi, +#' response = "vi_mean", +#' predictors = vi_predictors, +#' max_cor = 0.75 +#' ) +#' #' #without response #' #only numeric predictors are returned #' df <- vif_df( @@ -101,11 +110,17 @@ vif_df <- function( #and replace them with 0.99 or -0.99 cor.matrix.range <- range(cor.matrix[upper.tri(cor.matrix)]) if(1 %in% cor.matrix.range){ - cor.matrix[cor.matrix == 1] <- 0.99999999 + cor.matrix[cor.matrix == 1] <- 0.999 diag(cor.matrix) <- 1 } if(-1 %in% cor.matrix.range){ - cor.matrix[cor.matrix == -1] <- -0.99999999 + cor.matrix[cor.matrix == -1] <- -0.999 + } + + if(capabilities("long.double") == TRUE){ + tolerance = 0 + } else { + tolerance = .Machine$double.eps } #vif data frame @@ -113,7 +128,7 @@ vif_df <- function( { vif.df <- cor.matrix |> - solve(tol = 0) |> + solve(tol = tolerance) |> diag() |> data.frame(stringsAsFactors = FALSE) |> dplyr::rename(vif = 1) |> @@ -126,7 +141,7 @@ vif_df <- function( rownames(vif.df) <- NULL }, error = function(e) { - stop("the VIF computation failed. Please check for perfect correlations between predictors, or an excessive number of NA values in the 'df' argument.") + stop("the VIF computation failed. Please use cor_df() or cor_select() to check and remove perfect correlations from df before the VIF assessment.") } ) diff --git a/R/vif_select.R b/R/vif_select.R index e073986..f1d5c98 100644 --- a/R/vif_select.R +++ b/R/vif_select.R @@ -4,6 +4,8 @@ #' #' Automates multicollinearity management by selecting variables based on their Variance Inflation Factor (VIF). #' +#' Warning: predictors with perfect correlation might cause errors, please use [cor_select()] to remove perfect correlations first. 
+#' #' The [vif_select()] function is designed to automate the reduction of multicollinearity in a set of predictors by using Variance Inflation Factors. #' #' If the 'response' argument is provided, categorical predictors are converted to numeric via target encoding (see [target_encoding_lab()]). If the 'response' argument is not provided, categorical variables are ignored. @@ -45,6 +47,14 @@ #' vi <- vi[1:1000, ] #' vi_predictors <- vi_predictors[1:10] #' +#' #reduce correlation in predictors with cor_select() +#' vi_predictors <- cor_select( +#' df = vi, +#' response = "vi_mean", +#' predictors = vi_predictors, +#' max_cor = 0.75 +#' ) +#' #' #without response #' #without preference_order #' #permissive max_vif diff --git a/README.md b/README.md index ec28615..b58c1e8 100644 --- a/README.md +++ b/README.md @@ -516,7 +516,7 @@ selected_predictors_response <- cor_select( predictors = vi_predictors ) tictoc::toc() -#> 0.432 sec elapsed +#> 0.414 sec elapsed tictoc::tic() selected_predictors_no_response <- cor_select( @@ -524,7 +524,7 @@ selected_predictors_no_response <- cor_select( predictors = vi_predictors ) tictoc::toc() -#> 34.798 sec elapsed +#> 34.603 sec elapsed ``` ``` r diff --git a/cran-comments.md b/cran-comments.md index b9ab95e..f281047 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,19 +1,138 @@ -## Fixed issue +## Identified issue - Reading CITATION file fails with - there is no package called 'collinear' - when package is not installed. - -This error was caused by this line of code in inst/CITATION: +This new release responds to the following email by Prof Brian Ripley: -note <- sprintf("R package version %s", packageVersion("collinear")) + Dear maintainer, + + Please see the problems shown on + . + + Please correct before 2023-12-21 to safely retain your package on CRAN. + + Do remember to look at the 'Additional issues'. 
+ + The CRAN Team + +I can only guess that the received notice refers to the noLD check shown in the "Additional issues" section, which reads as follows: + +* using log directory ‘/data/gannet/ripley/R/packages/tests-noLD/collinear.Rcheck’ +* using R Under development (unstable) (2023-12-04 r85659) +* using platform: x86_64-pc-linux-gnu +* ... +* checking tests ... + Running ‘spelling.R’ + Running ‘testthat.R’ [169s/170s] + [170s/170s] ERROR +Running the tests in ‘tests/testthat.R’ failed. +Complete output: + > # This file is part of the standard setup for testthat. + > # It is recommended that you do not modify it. + > # + > # Where should you do additional test configuration? + > # Learn more about the roles of various files in: + > # * https://r-pkgs.org/tests.html + > # * https://testthat.r-lib.org/reference/test_package.html#special-files + > + > library(testthat) + > library(collinear) + > + > test_check("collinear") + [ FAIL 1 | WARN 0 | SKIP 0 | PASS 120 ] + + ══ Failed tests ════════════════════════════════════════════════════════════════ + ── Error ('test-vif_df.R:37:3'): `vif_df()` works ────────────────────────────── + Error in `value[[3L]](cond)`: the VIF computation failed. Please check for perfect correlations between predictors, or an excessive number of NA values in the 'df' argument. + Backtrace: + ▆ + 1. └─collinear::vif_df(df = vi, response = "vi_mean", predictors = vi_predictors) at test-vif_df.R:37:3 + 2. └─base::tryCatch(...) + 3. └─base (local) tryCatchList(expr, classes, parentenv, handlers) + 4. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]]) + 5. └─value[[3L]](cond) + + [ FAIL 1 | WARN 0 | SKIP 0 | PASS 120 ] + Error: Test failures + Execution halted +* checking PDF version of manual ... OK +* checking HTML version of manual ... OK +* checking for non-standard things in the check directory ... OK +* checking for detritus in the temp directory ... 
OK +* DONE + +Status: 1 ERROR, 1 NOTE +See + ‘/data/gannet/ripley/R/packages/tests-noLD/collinear.Rcheck/00check.log’ +for details. + +Command exited with non-zero status 1 +Time 4:36.35, 249.72 + 10.45 + +## Solution + +The offending code was in line 116 of the function vif_df(), which read as follows: + +```r +vif.df <- cor.matrix |> + solve(tol = 0) |> + diag() |> + ... +``` + +The new version reads: + +```r +vif.df <- cor.matrix |> + solve() |> + diag() |> + ... +``` + +However, when tol != 0, solve() breaks when variables with perfect correlations are introduced. As such, all failing tests and examples now ensure that no perfect correlations reach solve(), as they produce errors. Also, a warning has been added to the documentation of vif_df() and vif_select() to let the user know that perfect correlations may break these functions. + +## Testing changes in noLD + +I tested these changes in the noLD platform provided by rhub: + +```r +rhub::local_check_linux(".", image = "rhub/debian-gcc-devel-nold") + +... + + +• image: rhub/debian-gcc-devel-nold +• output: + R-hub Linux builder script v0.10.0 (c) R Consortium, 2018-2019 + + Package: /tmp/RtmpLeiUIg/file3558a245af2e8/collinear_1.1.1.tar.gz + Docker image: rhub/debian-gcc-devel-nold + Env vars: + ... +• container_name: 3c762b6e-3759-4368-adfb-e71cad78f781-2 +• artifacts: + /tmp/RtmpLeiUIg/file3558a12449344 +• check_result: +── R CMD check results ────────────────────────────────────── collinear 1.1.1 ──── +Duration: 0ms + +❯ checking data for non-ASCII characters ... NOTE + Note: found 89 marked Latin-1 strings + Note: found 1203 marked UTF-8 strings + +0 errors ✔ | 0 warnings ✔ | 1 note ✖ +``` + +The only NOTE refers to Latin-1 and UTF-8 strings. -This line has been removed from the file. 
## R CMD check results +── R CMD check results ────────────── collinear 1.1.1 ──── +Duration: 1m 15.1s + 0 errors ✔ | 0 warnings ✔ | 0 notes ✔ +R CMD check succeeded + ### Tested platforms + macos-latest (release) @@ -21,7 +140,29 @@ This line has been removed from the file. + ubuntu-latest (oldrel-1) + ubuntu-latest (release) + windows-latest (release) + +## Test results + +==> devtools::test() + +ℹ Testing collinear +✔ | F W S OK | Context +✔ | 5 | auc_score +✔ | 2 | case_weights +✔ | 16 | collinear [1.4s] +✔ | 6 | cor_df [9.8s] +✔ | 8 | cor_matrix [9.5s] +✔ | 4 | cor_select [18.7s] +✔ | 3 | cramer_v +✔ | 6 | identify [1.2s] +✔ | 33 | preference_order [13.6s] +✔ | 9 | target_encoding_lab +✔ | 11 | target_encoding_methods +✔ | 7 | validate [1.2s] +✔ | 8 | vif_df +✔ | 6 | vif_select -## Comments +══ Results ═════════════════════════════════════════════════ +Duration: 59.1 s -This is a new version (1.1.0) of the package with several new minor features and bug fixes described in the NEWS.md file. +[ FAIL 0 | WARN 0 | SKIP 0 | PASS 124 ] diff --git a/dev_scripts/noLD_check.R b/dev_scripts/noLD_check.R new file mode 100644 index 0000000..53801a3 --- /dev/null +++ b/dev_scripts/noLD_check.R @@ -0,0 +1,8 @@ +#relevant info: https://blog.r-hub.io/2019/05/21/nold/ +install.packages("rhub") + +#check online +rhub::check(".", platform = "debian-gcc-devel-nold") + +#check local +rhub::local_check_linux(".", image = "rhub/debian-gcc-devel-nold") diff --git a/man/vif_df.Rd b/man/vif_df.Rd index e3c9bcd..0c867be 100644 --- a/man/vif_df.Rd +++ b/man/vif_df.Rd @@ -21,13 +21,14 @@ Data frame with predictor names and VIF values \description{ Computes the Variance Inflation Factor of all variables in a training data frame. +Warning: predictors with perfect correlation might cause errors, please use \code{\link[=cor_select]{cor_select()}} to remove perfect correlations first. 
+ The Variance Inflation Factor for a given variable \code{y} is computed as \code{1/(1-R2)}, where \code{R2} is the multiple R-squared of a multiple regression model fitted using \code{y} as response and all the remaining variables of the input data set as predictors. The equation can be interpreted as "the rate of perfect model's R-squared to the unexplained variance of this model". The possible range of VIF values is (1, Inf]. A VIF lower than 10 suggest that removing \code{y} from the data set would reduce overall multicollinearity. -This function computes the Variance Inflation Factor (VIF) in three steps: +This function computes the Variance Inflation Factor (VIF) in two steps: \itemize{ -\item Computes the correlation matrix between all pairs of predictors using \verb{\link[stats]\{cor\}}. \item Applies \verb{\link[base]\{solve\}} to obtain the precision matrix, which is the inverse of the covariance matrix. \item Uses \verb{\link[base]\{diag\}} to extract the diagonal of the precision matrix, which contains the variance of the prediction of each predictor from all other predictors. } @@ -42,6 +43,14 @@ data( #subset to limit example run time vi <- vi[1:1000, ] +#reduce correlation in predictors with cor_select() +vi_predictors <- cor_select( + df = vi, + response = "vi_mean", + predictors = vi_predictors, + max_cor = 0.75 +) + #without response #only numeric predictors are returned df <- vif_df( diff --git a/man/vif_select.Rd b/man/vif_select.Rd index b1902fc..3b8164b 100644 --- a/man/vif_select.Rd +++ b/man/vif_select.Rd @@ -32,6 +32,8 @@ Character vector with the names of the selected predictors. \description{ Automates multicollinearity management by selecting variables based on their Variance Inflation Factor (VIF). +Warning: predictors with perfect correlation might cause errors, please use \code{\link[=cor_select]{cor_select()}} to remove perfect correlations first. 
+ The \code{\link[=vif_select]{vif_select()}} function is designed to automate the reduction of multicollinearity in a set of predictors by using Variance Inflation Factors. If the 'response' argument is provided, categorical predictors are converted to numeric via target encoding (see \code{\link[=target_encoding_lab]{target_encoding_lab()}}). If the 'response' argument is not provided, categorical variables are ignored. @@ -65,6 +67,14 @@ data( vi <- vi[1:1000, ] vi_predictors <- vi_predictors[1:10] +#reduce correlation in predictors with cor_select() +vi_predictors <- cor_select( + df = vi, + response = "vi_mean", + predictors = vi_predictors, + max_cor = 0.75 +) + #without response #without preference_order #permissive max_vif diff --git a/tests/testthat/test-vif_df.R b/tests/testthat/test-vif_df.R index 9bca63c..38d0ee6 100644 --- a/tests/testthat/test-vif_df.R +++ b/tests/testthat/test-vif_df.R @@ -4,8 +4,13 @@ testthat::test_that("`vif_df()` works", { vi <- vi[1:1000, ] #create a few perfect correlations in vi - vi$a <- vi$b <- vi$c <- vi$d <- vi$soil_sand - vi_predictors <- c(vi_predictors, "a", "b", "c") + #reduce correlation in predictors with cor_select() + vi_predictors <- cor_select( + df = vi, + response = "vi_mean", + predictors = vi_predictors, + max_cor = 0.75 + ) # Test with only numeric predictors df <- vif_df( diff --git a/tests/testthat/test-vif_select.R b/tests/testthat/test-vif_select.R index 3d1190a..cce8903 100644 --- a/tests/testthat/test-vif_select.R +++ b/tests/testthat/test-vif_select.R @@ -2,6 +2,15 @@ testthat::test_that("`vif_select()` works", { data(vi, vi_predictors) vi <- vi[1:1000, ] + #create a few perfect correlations in vi + #reduce correlation in predictors with cor_select() + vi_predictors <- cor_select( + df = vi, + response = "vi_mean", + predictors = vi_predictors, + max_cor = 0.75 + ) + # Test with only numeric predictors selected_predictors <- vif_select( df = vi,