diff --git a/DESCRIPTION b/DESCRIPTION index 12ee0457..8022a457 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: fastcpd Type: Package Title: Fast Change Point Detection Based on Dynamic Programming with Pruning with Sequential Gradient Descent -Version: 0.5.2 +Version: 0.5.3 Authors@R: c( person("Xingchi", "Li", email = "anthony.li@stat.tamu.edu", role = c("aut", "cre", "cph"), diff --git a/NEWS.md b/NEWS.md index 487d8ea7..d3ffc02e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# fastcpd 0.5.3 + +* Add more experiments, commented out to keep the test run time short + without affecting test coverage. +* Add examples in README. +* Add CRAN manual. + # fastcpd 0.5.2 * Add one-dimensional linear regression example with plot. diff --git a/README.Rmd b/README.Rmd index 41660b1a..efd169ff 100644 --- a/README.Rmd +++ b/README.Rmd @@ -7,7 +7,7 @@ output: github_document knitr::opts_chunk$set( collapse = TRUE, comment = "#>", - fig.path = "README-" + fig.path = "man/figures/README-" ) options(cli.hyperlink = FALSE) ``` @@ -33,7 +33,301 @@ If you'd like to learn how to use the fastcpd effectively, please refer to: * [Sequential Gradient Descent and Quasi-Newton's Method for Change-Point Analysis](https://proceedings.mlr.press/v206/zhang23b.html) -[documentation & examples](https://fastcpd.xingchi.li/reference/fastcpd.html#ref-examples) +## Examples + +[Documentation](https://fastcpd.xingchi.li/reference/fastcpd.html#ref-examples) + + +
+ Example usage of `fastcpd` + + ```{r examples, eval = TRUE} + # Linear regression + library(fastcpd) + set.seed(1) + p <- 3 + x <- mvtnorm::rmvnorm(300, rep(0, p), diag(p)) + theta_0 <- rbind(c(1, 1.2, -1), c(-1, 0, 0.5), c(0.5, -0.3, 0.2)) + y <- c( + x[1:100, ] %*% theta_0[1, ] + rnorm(100, 0, 1), + x[101:200, ] %*% theta_0[2, ] + rnorm(100, 0, 1), + x[201:300, ] %*% theta_0[3, ] + rnorm(100, 0, 1) + ) + result <- fastcpd( + formula = y ~ . - 1, + data = data.frame(y = y, x = x), + family = "gaussian" + ) + plot(result) + summary(result) + # Linear regression with one-dimensional covariate + library(fastcpd) + set.seed(1) + p <- 1 + x <- mvtnorm::rmvnorm(300, rep(0, p), diag(p)) + theta_0 <- matrix(c(1, -1, 0.5)) + y <- c( + x[1:100, ] * theta_0[1, ] + rnorm(100, 0, 1), + x[101:200, ] * theta_0[2, ] + rnorm(100, 0, 1), + x[201:300, ] * theta_0[3, ] + rnorm(100, 0, 1) + ) + result <- fastcpd( + formula = y ~ . - 1, + data = data.frame(y = y, x = x), + family = "gaussian" + ) + plot(result) + summary(result) + # Logistic regression + library(fastcpd) + set.seed(1) + x <- matrix(rnorm(1500, 0, 1), ncol = 5) + theta <- rbind(rnorm(5, 0, 1), rnorm(5, 2, 1)) + y <- c( + rbinom(125, 1, 1 / (1 + exp(-x[1:125, ] %*% theta[1, ]))), + rbinom(175, 1, 1 / (1 + exp(-x[126:300, ] %*% theta[2, ]))) + ) + result <- suppressWarnings(fastcpd( + formula = y ~ . - 1, + data = data.frame(y = y, x = x), + family = "binomial" + )) + summary(result) + # Poisson regression + library(fastcpd) + set.seed(1) + p <- 3 + x <- mvtnorm::rmvnorm(1500, rep(0, p), diag(p)) + delta <- rnorm(p) + theta_0 <- c(1, 1.2, -1) + y <- c( + rpois(300, exp(x[1:300, ] %*% theta_0)), + rpois(400, exp(x[301:700, ] %*% (theta_0 + delta))), + rpois(300, exp(x[701:1000, ] %*% theta_0)), + rpois(100, exp(x[1001:1100, ] %*% (theta_0 - delta))), + rpois(200, exp(x[1101:1300, ] %*% theta_0)), + rpois(200, exp(x[1301:1500, ] %*% (theta_0 + delta))) + ) + result <- fastcpd( + formula = y ~ . - 1, + data = data.frame(y = y, x = x), + beta = (p + 1) * log(1500) / 2, + k = function(x) 0, + family = "poisson", + epsilon = 1e-5 + ) + summary(result) + # Penalized linear regression + library(fastcpd) + set.seed(1) + n <- 1500 + p_true <- 6 + p <- 50 + x <- mvtnorm::rmvnorm(1500, rep(0, p), diag(p)) + theta_0 <- rbind( + runif(p_true, -5, -2), + runif(p_true, -3, 3), + runif(p_true, 2, 5), + runif(p_true, -5, 5) + ) + theta_0 <- cbind(theta_0, matrix(0, ncol = p - p_true, nrow = 4)) + y <- c( + x[1:300, ] %*% theta_0[1, ] + rnorm(300, 0, 1), + x[301:700, ] %*% theta_0[2, ] + rnorm(400, 0, 1), + x[701:1000, ] %*% theta_0[3, ] + rnorm(300, 0, 1), + x[1001:1500, ] %*% theta_0[4, ] + rnorm(500, 0, 1) + ) + result <- fastcpd( + formula = y ~ . - 1, + data = data.frame(y = y, x = x), + family = "lasso" + ) + plot(result) + summary(result) + # Custom cost function: logistic regression + library(fastcpd) + set.seed(1) + p <- 5 + x <- matrix(rnorm(375 * p, 0, 1), ncol = p) + theta <- rbind(rnorm(p, 0, 1), rnorm(p, 2, 1)) + y <- c( + rbinom(200, 1, 1 / (1 + exp(-x[1:200, ] %*% theta[1, ]))), + rbinom(175, 1, 1 / (1 + exp(-x[201:375, ] %*% theta[2, ]))) + ) + data <- data.frame(y = y, x = x) + result_builtin <- fastcpd( + formula = y ~ . 
- 1, + data = data, + family = "binomial" + ) + logistic_loss <- function(data, theta) { + x <- data[, -1] + y <- data[, 1] + u <- x %*% theta + nll <- -y * u + log(1 + exp(u)) + nll[u > 10] <- -y[u > 10] * u[u > 10] + u[u > 10] + sum(nll) + } + logistic_loss_gradient <- function(data, theta) { + x <- data[nrow(data), -1] + y <- data[nrow(data), 1] + c(-(y - 1 / (1 + exp(-x %*% theta)))) * x + } + logistic_loss_hessian <- function(data, theta) { + x <- data[nrow(data), -1] + prob <- 1 / (1 + exp(-x %*% theta)) + (x %o% x) * c((1 - prob) * prob) + } + result_custom <- fastcpd( + formula = y ~ . - 1, + data = data, + epsilon = 1e-5, + cost = logistic_loss, + cost_gradient = logistic_loss_gradient, + cost_hessian = logistic_loss_hessian + ) + cat( + "Change points detected by built-in logistic regression model: ", + result_builtin@cp_set, "\n", + "Change points detected by custom logistic regression model: ", + result_custom@cp_set, "\n", + sep = "" + ) + # Custom cost function: mean shift + library(fastcpd) + set.seed(1) + p <- 1 + data <- rbind( + mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(100, p)), + mvtnorm::rmvnorm(400, mean = rep(50, p), sigma = diag(100, p)), + mvtnorm::rmvnorm(300, mean = rep(2, p), sigma = diag(100, p)) + ) + segment_count_guess <- 10 + block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) + block_count <- floor(nrow(data) / block_size) + data_all_vars <- rep(0, block_count) + for (block_index in seq_len(block_count)) { + block_start <- (block_index - 1) * block_size + 1 + block_end <- if (block_index < block_count) block_index * block_size else nrow(data) + data_all_vars[block_index] <- var(data[block_start:block_end, ]) + } + data_all_var <- mean(data_all_vars) + mean_loss <- function(data) { + n <- nrow(data) + (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var + + n / 2 * (log(data_all_var) + log(2 * pi)) + } + mean_loss_result <- fastcpd( + formula = ~ . - 1, + data = data.frame(data), + beta = (p + 1) * log(nrow(data)) / 2, + p = p, + cost = mean_loss + ) + summary(mean_loss_result) + # Custom cost function: variance change + library(fastcpd) + set.seed(1) + p <- 1 + data <- rbind.data.frame( + mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), + mvtnorm::rmvnorm(400, mean = rep(0, p), sigma = diag(50, p)), + mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(2, p)) + ) + data_all_mu <- colMeans(data) + var_loss <- function(data) { + demeaned_data_norm <- norm(sweep(data, 2, data_all_mu), type = "F") + nrow(data) * (1 + log(2 * pi) + log(demeaned_data_norm^2 / nrow(data))) / 2 + } + var_loss_result <- fastcpd( + formula = ~ . - 1, + data = data, + beta = (p + 1) * log(nrow(data)) / 2, + p = p, + cost = var_loss + ) + summary(var_loss_result) + # Custom cost function: mean shift and variance change + library(fastcpd) + set.seed(1) + p <- 1 + data <- rbind.data.frame( + mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), + mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)), + mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(50, p)), + mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), + mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)), + mvtnorm::rmvnorm(300, mean = rep(10, p), sigma = diag(50, p)) + ) + meanvar_loss <- function(data) { + loss_part <- (colSums(data^2) - colSums(data)^2 / nrow(data)) / nrow(data) + nrow(data) * (1 + log(2 * pi) + log(loss_part)) / 2 + } + meanvar_loss_result <- fastcpd( + formula = ~ . 
- 1, + data = data, + beta = (2 * p + 1) * log(nrow(data)) / 2, + p = 2 * p, + cost = meanvar_loss + ) + summary(meanvar_loss_result) + # Custom cost function: Huber loss + library(fastcpd) + set.seed(1) + n <- 400 + 300 + 500 + p <- 5 + x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(p)) + theta <- rbind( + mvtnorm::rmvnorm(1, mean = rep(0, p - 3), sigma = diag(p - 3)), + mvtnorm::rmvnorm(1, mean = rep(5, p - 3), sigma = diag(p - 3)), + mvtnorm::rmvnorm(1, mean = rep(9, p - 3), sigma = diag(p - 3)) + ) + theta <- cbind(theta, matrix(0, 3, 3)) + theta <- theta[rep(seq_len(3), c(400, 300, 500)), ] + y_true <- rowSums(x * theta) + factor <- c( + 2 * stats::rbinom(400, size = 1, prob = 0.95) - 1, + 2 * stats::rbinom(300, size = 1, prob = 0.95) - 1, + 2 * stats::rbinom(500, size = 1, prob = 0.95) - 1 + ) + y <- factor * y_true + stats::rnorm(n) + data <- cbind.data.frame(y, x) + huber_threshold <- 1 + huber_loss <- function(data, theta) { + residual <- data[, 1] - data[, -1, drop = FALSE] %*% theta + indicator <- abs(residual) <= huber_threshold + sum( + residual^2 / 2 * indicator + + huber_threshold * (abs(residual) - huber_threshold / 2) * (1 - indicator) + ) + } + huber_loss_gradient <- function(data, theta) { + residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta) + if (abs(residual) <= huber_threshold) { + -residual * data[nrow(data), -1] + } else { + -huber_threshold * sign(residual) * data[nrow(data), -1] + } + } + huber_loss_hessian <- function(data, theta) { + residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta) + if (abs(residual) <= huber_threshold) { + outer(data[nrow(data), -1], data[nrow(data), -1]) + } else { + 0.01 * diag(length(theta)) + } + } + huber_regression_result <- fastcpd( + formula = y ~ . - 1, + data = data, + beta = (p + 1) * log(n) / 2, + cost = huber_loss, + cost_gradient = huber_loss_gradient, + cost_hessian = huber_loss_hessian + ) + summary(huber_regression_result) + ``` +
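The custom `mean_loss` cost in the mean-shift example above is the Gaussian negative log-likelihood of a segment, with the segment mean plugged in and the variance held fixed at the pre-estimated `data_all_var`. Below is a minimal, self-contained sketch of that identity for `p = 1`; the names `mean_loss_check` and `sigma2` are hypothetical stand-ins for the README's `mean_loss` and `data_all_var` and are not part of the package.

``` r
# Hypothetical sanity check: for p = 1 the custom cost equals the Gaussian
# negative log-likelihood evaluated at the segment mean with variance sigma2.
set.seed(1)
segment <- matrix(rnorm(300, mean = 0, sd = 10), ncol = 1)
sigma2 <- 100  # plays the role of data_all_var in the example above
mean_loss_check <- function(data) {
  n <- nrow(data)
  (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / sigma2 +
    n / 2 * (log(sigma2) + log(2 * pi))
}
nll <- -sum(dnorm(segment, mean = mean(segment), sd = sqrt(sigma2), log = TRUE))
all.equal(c(mean_loss_check(segment)), nll)  # should return TRUE (up to floating point error)
```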
## Installation diff --git a/README.md b/README.md index 1fefe983..d69c9fef 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,483 @@ to: Change-Point Analysis](https://proceedings.mlr.press/v206/zhang23b.html) -[documentation & -examples](https://fastcpd.xingchi.li/reference/fastcpd.html#ref-examples) +## Examples + +[Documentation](https://fastcpd.xingchi.li/reference/fastcpd.html#ref-examples) + + +
+ +Example usage of `fastcpd` + + +``` r + # Linear regression + library(fastcpd) + set.seed(1) + p <- 3 + x <- mvtnorm::rmvnorm(300, rep(0, p), diag(p)) + theta_0 <- rbind(c(1, 1.2, -1), c(-1, 0, 0.5), c(0.5, -0.3, 0.2)) + y <- c( + x[1:100, ] %*% theta_0[1, ] + rnorm(100, 0, 1), + x[101:200, ] %*% theta_0[2, ] + rnorm(100, 0, 1), + x[201:300, ] %*% theta_0[3, ] + rnorm(100, 0, 1) + ) + result <- fastcpd( + formula = y ~ . - 1, + data = data.frame(y = y, x = x), + family = "gaussian" + ) + plot(result) +``` + +![](man/figures/README-examples-1.png) + +``` r + summary(result) + #> + #> Call: + #> fastcpd(formula = y ~ . - 1, data = data.frame(y = y, x = x), + #> family = "gaussian") + #> + #> Change points: + #> 98 202 + #> + #> Cost values: + #> 53.44023 53.1441 45.04974 + #> + #> Parameters: + #> segment 1 segment 2 segment 3 + #> 1 0.9704022 -1.07884004 0.5925092 + #> 2 1.1786074 -0.01757927 -0.5287126 + #> 3 -0.9258587 0.63906143 0.1929411 + # Linear regression with one-dimensional covariate + library(fastcpd) + set.seed(1) + p <- 1 + x <- mvtnorm::rmvnorm(300, rep(0, p), diag(p)) + theta_0 <- matrix(c(1, -1, 0.5)) + y <- c( + x[1:100, ] * theta_0[1, ] + rnorm(100, 0, 1), + x[101:200, ] * theta_0[2, ] + rnorm(100, 0, 1), + x[201:300, ] * theta_0[3, ] + rnorm(100, 0, 1) + ) + result <- fastcpd( + formula = y ~ . - 1, + data = data.frame(y = y, x = x), + family = "gaussian" + ) + plot(result) +``` + +![](man/figures/README-examples-2.png) + +``` r + summary(result) + #> + #> Call: + #> fastcpd(formula = y ~ . - 1, data = data.frame(y = y, x = x), + #> family = "gaussian") + #> + #> Change points: + #> 100 194 + #> + #> Cost values: + #> 48.71927 57.20738 63.15088 + #> + #> Parameters: + #> segment 1 segment 2 segment 3 + #> 1 0.9520606 -0.8054074 0.3692224 + # Logistic regression + library(fastcpd) + set.seed(1) + x <- matrix(rnorm(1500, 0, 1), ncol = 5) + theta <- rbind(rnorm(5, 0, 1), rnorm(5, 2, 1)) + y <- c( + rbinom(125, 1, 1 / (1 + exp(-x[1:125, ] %*% theta[1, ]))), + rbinom(175, 1, 1 / (1 + exp(-x[126:300, ] %*% theta[2, ]))) + ) + result <- suppressWarnings(fastcpd( + formula = y ~ . - 1, + data = data.frame(y = y, x = x), + family = "binomial" + )) + summary(result) + #> + #> Call: + #> fastcpd(formula = y ~ . - 1, data = data.frame(y = y, x = x), + #> family = "binomial") + #> + #> Change points: + #> 126 + #> + #> Cost values: + #> 56.90525 30.76875 + #> + #> Parameters: + #> segment 1 segment 2 + #> 1 0.7259293 1.878525 + #> 2 -1.0294802 2.704376 + #> 3 1.0576503 3.702310 + #> 4 -0.8812767 2.258796 + #> 5 0.2419351 2.524173 + # Poisson regression + library(fastcpd) + set.seed(1) + p <- 3 + x <- mvtnorm::rmvnorm(1500, rep(0, p), diag(p)) + delta <- rnorm(p) + theta_0 <- c(1, 1.2, -1) + y <- c( + rpois(300, exp(x[1:300, ] %*% theta_0)), + rpois(400, exp(x[301:700, ] %*% (theta_0 + delta))), + rpois(300, exp(x[701:1000, ] %*% theta_0)), + rpois(100, exp(x[1001:1100, ] %*% (theta_0 - delta))), + rpois(200, exp(x[1101:1300, ] %*% theta_0)), + rpois(200, exp(x[1301:1500, ] %*% (theta_0 + delta))) + ) + result <- fastcpd( + formula = y ~ . - 1, + data = data.frame(y = y, x = x), + beta = (p + 1) * log(1500) / 2, + k = function(x) 0, + family = "poisson", + epsilon = 1e-5 + ) + summary(result) + #> + #> Call: + #> fastcpd(formula = y ~ . 
- 1, data = data.frame(y = y, x = x), + #> beta = (p + 1) * log(1500)/2, k = function(x) 0, family = "poisson", + #> epsilon = 1e-05) + #> + #> Change points: + #> 329 728 1021 1107 1325 + #> + #> Cost values: + #> 14425.87 13971.23 697.2187 107.5353 380.7153 51.93594 + #> + #> Parameters: + #> segment 1 segment 2 segment 3 segment 4 segment 5 segment 6 + #> 1 2.60927673 1.9255183 0.7405125 -0.3965022 1.117753 2.5479308 + #> 2 0.02398457 0.1068924 1.4721444 1.8677797 1.019035 0.4947115 + #> 3 -1.34361104 -2.7353603 -0.8906937 0.4651667 -1.178933 -2.5038966 + # Penalized linear regression + library(fastcpd) + set.seed(1) + n <- 1500 + p_true <- 6 + p <- 50 + x <- mvtnorm::rmvnorm(1500, rep(0, p), diag(p)) + theta_0 <- rbind( + runif(p_true, -5, -2), + runif(p_true, -3, 3), + runif(p_true, 2, 5), + runif(p_true, -5, 5) + ) + theta_0 <- cbind(theta_0, matrix(0, ncol = p - p_true, nrow = 4)) + y <- c( + x[1:300, ] %*% theta_0[1, ] + rnorm(300, 0, 1), + x[301:700, ] %*% theta_0[2, ] + rnorm(400, 0, 1), + x[701:1000, ] %*% theta_0[3, ] + rnorm(300, 0, 1), + x[1001:1500, ] %*% theta_0[4, ] + rnorm(500, 0, 1) + ) + result <- fastcpd( + formula = y ~ . - 1, + data = data.frame(y = y, x = x), + family = "lasso" + ) + plot(result) +``` + +![](man/figures/README-examples-3.png) + +``` r + summary(result) + #> + #> Call: + #> fastcpd(formula = y ~ . - 1, data = data.frame(y = y, x = x), + #> family = "lasso") + #> + #> Change points: + #> 300 701 1000 + #> + #> Cost values: + #> 181.7214 237.5852 164.4635 279.2901 + #> + #> Parameters: + #> 50 x 4 sparse Matrix of class "dgCMatrix" + #> segment 1 segment 2 segment 3 segment 4 + #> [1,] -2.950291 0.3790728 4.108208 -0.04955231 + #> [2,] -2.896748 -0.3979970 3.952567 3.15522854 + #> [3,] -2.875025 -0.2419562 2.634441 2.82389558 + #> [4,] -1.982441 0.5145819 3.370837 -0.58643066 + #> [5,] -3.116640 -0.5223081 2.140809 -3.42307221 + #> [6,] -1.908975 0.4789674 4.852307 . + #> [7,] . . . . + #> [8,] . . . . + #> [9,] . . . . + #> [10,] . . . . + #> [11,] . . . . + #> [12,] . . . . + #> [13,] . . . . + #> [14,] . . . . + #> [15,] . . . . + #> [16,] . . . . + #> [17,] . . . . + #> [18,] . . . . + #> [19,] . . . . + #> [20,] . . . . + #> [21,] . . . . + #> [22,] . . . . + #> [23,] . . . . + #> [24,] . . . . + #> [25,] . . . . + #> [26,] . . . . + #> [27,] . . . . + #> [28,] . . . . + #> [29,] . . . . + #> [30,] . . . . + #> [31,] . . . . + #> [32,] . . . . + #> [33,] . . . . + #> [34,] . . . . + #> [35,] . . . . + #> [36,] . . . . + #> [37,] . . . . + #> [38,] . . . . + #> [39,] . . . . + #> [40,] . . . . + #> [41,] . . . . + #> [42,] . . . . + #> [43,] . . . . + #> [44,] . . . . + #> [45,] . . . . + #> [46,] . . . . + #> [47,] . . . . + #> [48,] . . . . + #> [49,] . . . . + #> [50,] . . . . + # Custom cost function: logistic regression + library(fastcpd) + set.seed(1) + p <- 5 + x <- matrix(rnorm(375 * p, 0, 1), ncol = p) + theta <- rbind(rnorm(p, 0, 1), rnorm(p, 2, 1)) + y <- c( + rbinom(200, 1, 1 / (1 + exp(-x[1:200, ] %*% theta[1, ]))), + rbinom(175, 1, 1 / (1 + exp(-x[201:375, ] %*% theta[2, ]))) + ) + data <- data.frame(y = y, x = x) + result_builtin <- fastcpd( + formula = y ~ . 
- 1, + data = data, + family = "binomial" + ) + #> Warning: fit_glm: fitted probabilities numerically 0 or 1 occurred + + #> Warning: fit_glm: fitted probabilities numerically 0 or 1 occurred + + #> Warning: fit_glm: fitted probabilities numerically 0 or 1 occurred + logistic_loss <- function(data, theta) { + x <- data[, -1] + y <- data[, 1] + u <- x %*% theta + nll <- -y * u + log(1 + exp(u)) + nll[u > 10] <- -y[u > 10] * u[u > 10] + u[u > 10] + sum(nll) + } + logistic_loss_gradient <- function(data, theta) { + x <- data[nrow(data), -1] + y <- data[nrow(data), 1] + c(-(y - 1 / (1 + exp(-x %*% theta)))) * x + } + logistic_loss_hessian <- function(data, theta) { + x <- data[nrow(data), -1] + prob <- 1 / (1 + exp(-x %*% theta)) + (x %o% x) * c((1 - prob) * prob) + } + result_custom <- fastcpd( + formula = y ~ . - 1, + data = data, + epsilon = 1e-5, + cost = logistic_loss, + cost_gradient = logistic_loss_gradient, + cost_hessian = logistic_loss_hessian + ) + cat( + "Change points detected by built-in logistic regression model: ", + result_builtin@cp_set, "\n", + "Change points detected by custom logistic regression model: ", + result_custom@cp_set, "\n", + sep = "" + ) + #> Change points detected by built-in logistic regression model: 200 + #> Change points detected by custom logistic regression model: 201 + # Custom cost function: mean shift + library(fastcpd) + set.seed(1) + p <- 1 + data <- rbind( + mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(100, p)), + mvtnorm::rmvnorm(400, mean = rep(50, p), sigma = diag(100, p)), + mvtnorm::rmvnorm(300, mean = rep(2, p), sigma = diag(100, p)) + ) + segment_count_guess <- 10 + block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) + block_count <- floor(nrow(data) / block_size) + data_all_vars <- rep(0, block_count) + for (block_index in seq_len(block_count)) { + block_start <- (block_index - 1) * block_size + 1 + block_end <- if (block_index < block_count) block_index * block_size else nrow(data) + data_all_vars[block_index] <- var(data[block_start:block_end, ]) + } + data_all_var <- mean(data_all_vars) + mean_loss <- function(data) { + n <- nrow(data) + (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var + + n / 2 * (log(data_all_var) + log(2 * pi)) + } + mean_loss_result <- fastcpd( + formula = ~ . - 1, + data = data.frame(data), + beta = (p + 1) * log(nrow(data)) / 2, + p = p, + cost = mean_loss + ) + summary(mean_loss_result) + #> + #> Call: + #> fastcpd(formula = ~. - 1, data = data.frame(data), beta = (p + + #> 1) * log(nrow(data))/2, p = p, cost = mean_loss) + #> + #> Change points: + #> 300 700 + # Custom cost function: variance change + library(fastcpd) + set.seed(1) + p <- 1 + data <- rbind.data.frame( + mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), + mvtnorm::rmvnorm(400, mean = rep(0, p), sigma = diag(50, p)), + mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(2, p)) + ) + data_all_mu <- colMeans(data) + var_loss <- function(data) { + demeaned_data_norm <- norm(sweep(data, 2, data_all_mu), type = "F") + nrow(data) * (1 + log(2 * pi) + log(demeaned_data_norm^2 / nrow(data))) / 2 + } + var_loss_result <- fastcpd( + formula = ~ . - 1, + data = data, + beta = (p + 1) * log(nrow(data)) / 2, + p = p, + cost = var_loss + ) + summary(var_loss_result) + #> + #> Call: + #> fastcpd(formula = ~. 
- 1, data = data, beta = (p + 1) * log(nrow(data))/2, + #> p = p, cost = var_loss) + #> + #> Change points: + #> 300 699 + # Custom cost function: mean shift and variance change + library(fastcpd) + set.seed(1) + p <- 1 + data <- rbind.data.frame( + mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), + mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)), + mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(50, p)), + mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)), + mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)), + mvtnorm::rmvnorm(300, mean = rep(10, p), sigma = diag(50, p)) + ) + meanvar_loss <- function(data) { + loss_part <- (colSums(data^2) - colSums(data)^2 / nrow(data)) / nrow(data) + nrow(data) * (1 + log(2 * pi) + log(loss_part)) / 2 + } + meanvar_loss_result <- fastcpd( + formula = ~ . - 1, + data = data, + beta = (2 * p + 1) * log(nrow(data)) / 2, + p = 2 * p, + cost = meanvar_loss + ) + summary(meanvar_loss_result) + #> + #> Call: + #> fastcpd(formula = ~. - 1, data = data, beta = (2 * p + 1) * log(nrow(data))/2, + #> p = 2 * p, cost = meanvar_loss) + #> + #> Change points: + #> 300 700 1000 1300 1700 + # Custom cost function: Huber loss + library(fastcpd) + set.seed(1) + n <- 400 + 300 + 500 + p <- 5 + x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(p)) + theta <- rbind( + mvtnorm::rmvnorm(1, mean = rep(0, p - 3), sigma = diag(p - 3)), + mvtnorm::rmvnorm(1, mean = rep(5, p - 3), sigma = diag(p - 3)), + mvtnorm::rmvnorm(1, mean = rep(9, p - 3), sigma = diag(p - 3)) + ) + theta <- cbind(theta, matrix(0, 3, 3)) + theta <- theta[rep(seq_len(3), c(400, 300, 500)), ] + y_true <- rowSums(x * theta) + factor <- c( + 2 * stats::rbinom(400, size = 1, prob = 0.95) - 1, + 2 * stats::rbinom(300, size = 1, prob = 0.95) - 1, + 2 * stats::rbinom(500, size = 1, prob = 0.95) - 1 + ) + y <- factor * y_true + stats::rnorm(n) + data <- cbind.data.frame(y, x) + huber_threshold <- 1 + huber_loss <- function(data, theta) { + residual <- data[, 1] - data[, -1, drop = FALSE] %*% theta + indicator <- abs(residual) <= huber_threshold + sum( + residual^2 / 2 * indicator + + huber_threshold * (abs(residual) - huber_threshold / 2) * (1 - indicator) + ) + } + huber_loss_gradient <- function(data, theta) { + residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta) + if (abs(residual) <= huber_threshold) { + -residual * data[nrow(data), -1] + } else { + -huber_threshold * sign(residual) * data[nrow(data), -1] + } + } + huber_loss_hessian <- function(data, theta) { + residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta) + if (abs(residual) <= huber_threshold) { + outer(data[nrow(data), -1], data[nrow(data), -1]) + } else { + 0.01 * diag(length(theta)) + } + } + huber_regression_result <- fastcpd( + formula = y ~ . - 1, + data = data, + beta = (p + 1) * log(n) / 2, + cost = huber_loss, + cost_gradient = huber_loss_gradient, + cost_hessian = huber_loss_hessian + ) + summary(huber_regression_result) + #> + #> Call: + #> fastcpd(formula = y ~ . - 1, data = data, beta = (p + 1) * log(n)/2, + #> cost = huber_loss, cost_gradient = huber_loss_gradient, cost_hessian = huber_loss_hessian) + #> + #> Change points: + #> 401 726 +``` + +
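For reference, the custom `huber_loss` in the example above is the standard Huber objective with threshold `huber_threshold = 1`: each residual r = y - x'theta contributes quadratically when |r| <= 1 and linearly otherwise. A sketch of the formula the code sums over a segment:

```latex
\rho_\delta(r) =
  \begin{cases}
    r^2 / 2, & |r| \le \delta, \\
    \delta \left( |r| - \delta / 2 \right), & |r| > \delta,
  \end{cases}
\qquad
\operatorname{cost}(\text{segment}) = \sum_{i} \rho_\delta\!\left( y_i - x_i^\top \theta \right),
\quad \delta = 1.
```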
## Installation diff --git a/man/figures/README-examples-1.png b/man/figures/README-examples-1.png new file mode 100644 index 00000000..79778f42 Binary files /dev/null and b/man/figures/README-examples-1.png differ diff --git a/man/figures/README-examples-2.png b/man/figures/README-examples-2.png new file mode 100644 index 00000000..78520f99 Binary files /dev/null and b/man/figures/README-examples-2.png differ diff --git a/man/figures/README-examples-3.png b/man/figures/README-examples-3.png new file mode 100644 index 00000000..2de50605 Binary files /dev/null and b/man/figures/README-examples-3.png differ diff --git a/man/figures/manual.pdf b/man/figures/manual.pdf new file mode 100644 index 00000000..1cfc5b2e Binary files /dev/null and b/man/figures/manual.pdf differ diff --git a/tests/testthat/test-fastcpd.R b/tests/testthat/test-fastcpd.R index 89b27976..be51d3e5 100644 --- a/tests/testthat/test-fastcpd.R +++ b/tests/testthat/test-fastcpd.R @@ -461,80 +461,158 @@ test_that("huber regression with 0.1 vanilla", { expect_equal(huber_regression_result@cp_set, c(401, 726)) }) -test_that("confidence interval experiment", { - set.seed(1) - kDimension <- 1 - change_point_locations <- NULL - for (experiment_id in seq_len(20)) { - data <- rbind( - mvtnorm::rmvnorm(300, mean = rep(0, kDimension), sigma = diag(100, kDimension)), - mvtnorm::rmvnorm(400, mean = rep(50, kDimension), sigma = diag(100, kDimension)), - mvtnorm::rmvnorm(300, mean = rep(2, kDimension), sigma = diag(100, kDimension)) - ) - segment_count_guess <- 10 - block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) - block_count <- floor(nrow(data) / block_size) - data_all_vars <- rep(0, block_count) - for (block_index in seq_len(block_count)) { - block_start <- (block_index - 1) * block_size + 1 - block_end <- if (block_index < block_count) block_index * block_size else nrow(data) - data_all_vars[block_index] <- var(data[block_start:block_end, ]) - } - data_all_var <- mean(data_all_vars) - mean_loss <- function(data) { - n <- nrow(data) - (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var + - n / 2 * (log(data_all_var) + log(2 * pi)) - } - mean_loss_result <- fastcpd( - formula = ~ . - 1, - data = data.frame(data), - beta = (kDimension + 1) * log(nrow(data)) / 2, - p = kDimension, - cost = mean_loss - ) - change_point_locations <- c(change_point_locations, mean_loss_result@cp_set) - } - - change_point_locations_cookie_bucket <- NULL - cookie_bucket_id_list <- sample.int(n = 20, size = 1000, replace = TRUE) - all_data <- data - for (cookie_bucket_id in seq_len(20)) { - data <- all_data[cookie_bucket_id_list != cookie_bucket_id, , drop = FALSE] - segment_count_guess <- 10 - block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) - block_count <- floor(nrow(data) / block_size) - data_all_vars <- rep(0, block_count) - for (block_index in seq_len(block_count)) { - block_start <- (block_index - 1) * block_size + 1 - block_end <- if (block_index < block_count) block_index * block_size else nrow(data) - data_all_vars[block_index] <- var(data[block_start:block_end, ]) - } - data_all_var <- mean(data_all_vars) - mean_loss <- function(data) { - n <- nrow(data) - (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var + - n / 2 * (log(data_all_var) + log(2 * pi)) - } - mean_loss_result <- fastcpd( - formula = ~ . 
- 1, - data = data.frame(data), - beta = (kDimension + 1) * log(nrow(data)) / 2, - p = kDimension, - cost = mean_loss - ) - - for (cp in mean_loss_result@cp_set) { - ordinal_mapped_cp <- which(cumsum(cookie_bucket_id_list != cookie_bucket_id) == cp)[1] - change_point_locations_cookie_bucket <- c(change_point_locations_cookie_bucket, ordinal_mapped_cp) - } - } - - table_change_point_locations <- table(change_point_locations) - expect_equal(rownames(table_change_point_locations), c("269", "299", "300", "700")) - expect_equal(unname(table_change_point_locations), c(1, 1, 19, 20), ignore_attr = TRUE) - - table_change_point_locations_cookie_bucket <- table(change_point_locations_cookie_bucket) - expect_equal(rownames(table_change_point_locations_cookie_bucket), c("299", "300", "697", "700")) - expect_equal(unname(table_change_point_locations_cookie_bucket), c(1, 19, 1, 19), ignore_attr = TRUE) -}) +# test_that("confidence interval experiment", { +# set.seed(1) +# kDimension <- 1 +# change_point_locations <- NULL +# for (experiment_id in seq_len(20)) { +# data <- rbind( +# mvtnorm::rmvnorm(300, mean = rep(0, kDimension), sigma = diag(100, kDimension)), +# mvtnorm::rmvnorm(400, mean = rep(50, kDimension), sigma = diag(100, kDimension)), +# mvtnorm::rmvnorm(300, mean = rep(2, kDimension), sigma = diag(100, kDimension)) +# ) +# segment_count_guess <- 10 +# block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) +# block_count <- floor(nrow(data) / block_size) +# data_all_vars <- rep(0, block_count) +# for (block_index in seq_len(block_count)) { +# block_start <- (block_index - 1) * block_size + 1 +# block_end <- if (block_index < block_count) block_index * block_size else nrow(data) +# data_all_vars[block_index] <- var(data[block_start:block_end, ]) +# } +# data_all_var <- mean(data_all_vars) +# mean_loss <- function(data) { +# n <- nrow(data) +# (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var + +# n / 2 * (log(data_all_var) + log(2 * pi)) +# } +# mean_loss_result <- fastcpd( +# formula = ~ . - 1, +# data = data.frame(data), +# beta = (kDimension + 1) * log(nrow(data)) / 2, +# p = kDimension, +# cost = mean_loss +# ) +# change_point_locations <- c(change_point_locations, mean_loss_result@cp_set) +# } + +# change_point_locations_cookie_bucket <- NULL +# cookie_bucket_id_list <- sample.int(n = 20, size = 1000, replace = TRUE) +# all_data <- data +# for (cookie_bucket_id in seq_len(20)) { +# data <- all_data[cookie_bucket_id_list != cookie_bucket_id, , drop = FALSE] +# segment_count_guess <- 10 +# block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) +# block_count <- floor(nrow(data) / block_size) +# data_all_vars <- rep(0, block_count) +# for (block_index in seq_len(block_count)) { +# block_start <- (block_index - 1) * block_size + 1 +# block_end <- if (block_index < block_count) block_index * block_size else nrow(data) +# data_all_vars[block_index] <- var(data[block_start:block_end, ]) +# } +# data_all_var <- mean(data_all_vars) +# mean_loss <- function(data) { +# n <- nrow(data) +# (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var + +# n / 2 * (log(data_all_var) + log(2 * pi)) +# } +# mean_loss_result <- fastcpd( +# formula = ~ . 
- 1, +# data = data.frame(data), +# beta = (kDimension + 1) * log(nrow(data)) / 2, +# p = kDimension, +# cost = mean_loss +# ) + +# for (cp in mean_loss_result@cp_set) { +# ordinal_mapped_cp <- which(cumsum(cookie_bucket_id_list != cookie_bucket_id) == cp)[1] +# change_point_locations_cookie_bucket <- c(change_point_locations_cookie_bucket, ordinal_mapped_cp) +# } +# } + +# table_change_point_locations <- table(change_point_locations) +# expect_equal(rownames(table_change_point_locations), c("269", "299", "300", "700")) +# expect_equal(unname(table_change_point_locations), c(1, 1, 19, 20), ignore_attr = TRUE) + +# table_change_point_locations_cookie_bucket <- table(change_point_locations_cookie_bucket) +# expect_equal(rownames(table_change_point_locations_cookie_bucket), c("299", "300", "697", "700")) +# expect_equal(unname(table_change_point_locations_cookie_bucket), c(1, 19, 1, 19), ignore_attr = TRUE) +# }) + +# test_that("confidence interval experiment with one change point", { +# set.seed(1) +# kDimension <- 1 +# change_point_locations <- list() +# change_point_locations_cookie_bucket <- rep(list(NULL), 1000) +# containing_change_point <- rep(FALSE, 1000) +# for (experiment_id in seq_len(1000)) { +# data <- rbind( +# mvtnorm::rmvnorm(50, mean = rep(0, kDimension), sigma = diag(100, kDimension)), +# mvtnorm::rmvnorm(50, mean = rep(50, kDimension), sigma = diag(100, kDimension)) +# ) +# segment_count_guess <- 10 +# block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) +# block_count <- floor(nrow(data) / block_size) +# data_all_vars <- rep(0, block_count) +# for (block_index in seq_len(block_count)) { +# block_start <- (block_index - 1) * block_size + 1 +# block_end <- if (block_index < block_count) block_index * block_size else nrow(data) +# data_all_vars[block_index] <- var(data[block_start:block_end, ]) +# } +# data_all_var <- mean(data_all_vars) +# mean_loss <- function(data) { +# n <- nrow(data) +# (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var + +# n / 2 * (log(data_all_var) + log(2 * pi)) +# } +# mean_loss_result <- fastcpd( +# formula = ~ . - 1, +# data = data.frame(data), +# beta = (kDimension + 1) * log(nrow(data)) / 2, +# p = kDimension, +# cost = mean_loss +# ) +# change_point_locations[[experiment_id]] <- mean_loss_result@cp_set + +# cookie_bucket_id_list <- sample.int(n = 20, size = 50 + 50, replace = TRUE) +# all_data <- data +# for (cookie_bucket_id in seq_len(20)) { +# data <- all_data[cookie_bucket_id_list != cookie_bucket_id, , drop = FALSE] +# segment_count_guess <- 10 +# block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2) +# block_count <- floor(nrow(data) / block_size) +# data_all_vars <- rep(0, block_count) +# for (block_index in seq_len(block_count)) { +# block_start <- (block_index - 1) * block_size + 1 +# block_end <- if (block_index < block_count) block_index * block_size else nrow(data) +# data_all_vars[block_index] <- var(data[block_start:block_end, ]) +# } +# data_all_var <- mean(data_all_vars) +# mean_loss <- function(data) { +# n <- nrow(data) +# (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var + +# n / 2 * (log(data_all_var) + log(2 * pi)) +# } +# mean_loss_result <- fastcpd( +# formula = ~ . 
- 1, +# data = data.frame(data), +# beta = (kDimension + 1) * log(nrow(data)) / 2, +# p = kDimension, +# cost = mean_loss +# ) + +# for (cp in mean_loss_result@cp_set) { +# ordinal_mapped_cp <- which(cumsum(cookie_bucket_id_list != cookie_bucket_id) == cp)[1] +# change_point_locations_cookie_bucket[[experiment_id]] <- c(change_point_locations_cookie_bucket[[experiment_id]], ordinal_mapped_cp) +# } +# } +# if ( +# quantile(change_point_locations_cookie_bucket[[experiment_id]], 0.025) <= 50 && +# quantile(change_point_locations_cookie_bucket[[experiment_id]], 0.975) >= 50 +# ) { +# containing_change_point[experiment_id] <- TRUE +# } +# } + +# expect_equal(sum(containing_change_point), 927) +# })
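The commented-out experiments above map each change point found on a leave-one-bucket-out subsample back to its row index in the full data via `which(cumsum(cookie_bucket_id_list != cookie_bucket_id) == cp)[1]`. A tiny standalone illustration of that mapping, with made-up bucket assignments and not part of the test suite:

``` r
# Six observations assigned to three cookie buckets; bucket 2 is held out.
cookie_bucket_id_list <- c(1, 2, 1, 3, 2, 3)
cookie_bucket_id <- 2
# Rows kept in this fold are 1, 3, 4, 6. Suppose a change point is detected at
# position 3 of the subsample; it corresponds to row 4 of the full data.
cp <- 3
which(cumsum(cookie_bucket_id_list != cookie_bucket_id) == cp)[1]
#> [1] 4
```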