diff --git a/DESCRIPTION b/DESCRIPTION
index 12ee0457..8022a457 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -2,7 +2,7 @@ Package: fastcpd
Type: Package
Title: Fast Change Point Detection Based on Dynamic Programming with Pruning
with Sequential Gradient Descent
-Version: 0.5.2
+Version: 0.5.3
Authors@R: c(
person("Xingchi", "Li", email = "anthony.li@stat.tamu.edu",
role = c("aut", "cre", "cph"),
diff --git a/NEWS.md b/NEWS.md
index 487d8ea7..d3ffc02e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+# fastcpd 0.5.3
+
+* Add more experiments, commented out to keep test time manageable without
+  affecting test coverage.
+* Add examples to the README.
+* Add CRAN manual.
+
# fastcpd 0.5.2
* Add one-dimensional linear regression example with plot.
diff --git a/README.Rmd b/README.Rmd
index 41660b1a..efd169ff 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -7,7 +7,7 @@ output: github_document
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
- fig.path = "README-"
+ fig.path = "man/figures/README-"
)
options(cli.hyperlink = FALSE)
```
@@ -33,7 +33,301 @@ If you'd like to learn how to use the fastcpd effectively, please refer to:
* [Sequential Gradient Descent and Quasi-Newton's Method for Change-Point Analysis](https://proceedings.mlr.press/v206/zhang23b.html)
-[documentation & examples](https://fastcpd.xingchi.li/reference/fastcpd.html#ref-examples)
+## Examples
+
+[Documentation](https://fastcpd.xingchi.li/reference/fastcpd.html#ref-examples)
+
+
+<details close>
+<summary>
+  Example usage of `fastcpd`
+</summary>
+
+ ```{r examples, eval = TRUE}
+ # Linear regression
+ library(fastcpd)
+ set.seed(1)
+ p <- 3
+ x <- mvtnorm::rmvnorm(300, rep(0, p), diag(p))
+ theta_0 <- rbind(c(1, 1.2, -1), c(-1, 0, 0.5), c(0.5, -0.3, 0.2))
+ y <- c(
+ x[1:100, ] %*% theta_0[1, ] + rnorm(100, 0, 1),
+ x[101:200, ] %*% theta_0[2, ] + rnorm(100, 0, 1),
+ x[201:300, ] %*% theta_0[3, ] + rnorm(100, 0, 1)
+ )
+ result <- fastcpd(
+ formula = y ~ . - 1,
+ data = data.frame(y = y, x = x),
+ family = "gaussian"
+ )
+ plot(result)
+ summary(result)
+ # Linear regression with one-dimensional covariate
+ library(fastcpd)
+ set.seed(1)
+ p <- 1
+ x <- mvtnorm::rmvnorm(300, rep(0, p), diag(p))
+ theta_0 <- matrix(c(1, -1, 0.5))
+ y <- c(
+ x[1:100, ] * theta_0[1, ] + rnorm(100, 0, 1),
+ x[101:200, ] * theta_0[2, ] + rnorm(100, 0, 1),
+ x[201:300, ] * theta_0[3, ] + rnorm(100, 0, 1)
+ )
+ result <- fastcpd(
+ formula = y ~ . - 1,
+ data = data.frame(y = y, x = x),
+ family = "gaussian"
+ )
+ plot(result)
+ summary(result)
+ # Logistic regression
+ library(fastcpd)
+ set.seed(1)
+ x <- matrix(rnorm(1500, 0, 1), ncol = 5)
+ theta <- rbind(rnorm(5, 0, 1), rnorm(5, 2, 1))
+ y <- c(
+ rbinom(125, 1, 1 / (1 + exp(-x[1:125, ] %*% theta[1, ]))),
+ rbinom(175, 1, 1 / (1 + exp(-x[126:300, ] %*% theta[2, ])))
+ )
+ result <- suppressWarnings(fastcpd(
+ formula = y ~ . - 1,
+ data = data.frame(y = y, x = x),
+ family = "binomial"
+ ))
+ summary(result)
+ # Poisson regression
+ library(fastcpd)
+ set.seed(1)
+ p <- 3
+ x <- mvtnorm::rmvnorm(1500, rep(0, p), diag(p))
+ delta <- rnorm(p)
+ theta_0 <- c(1, 1.2, -1)
+ y <- c(
+ rpois(300, exp(x[1:300, ] %*% theta_0)),
+ rpois(400, exp(x[301:700, ] %*% (theta_0 + delta))),
+ rpois(300, exp(x[701:1000, ] %*% theta_0)),
+ rpois(100, exp(x[1001:1100, ] %*% (theta_0 - delta))),
+ rpois(200, exp(x[1101:1300, ] %*% theta_0)),
+ rpois(200, exp(x[1301:1500, ] %*% (theta_0 + delta)))
+ )
+ result <- fastcpd(
+ formula = y ~ . - 1,
+ data = data.frame(y = y, x = x),
+ beta = (p + 1) * log(1500) / 2,
+ k = function(x) 0,
+ family = "poisson",
+ epsilon = 1e-5
+ )
+ summary(result)
+ # Penalized linear regression
+ library(fastcpd)
+ set.seed(1)
+ n <- 1500
+ p_true <- 6
+ p <- 50
+ x <- mvtnorm::rmvnorm(1500, rep(0, p), diag(p))
+ theta_0 <- rbind(
+ runif(p_true, -5, -2),
+ runif(p_true, -3, 3),
+ runif(p_true, 2, 5),
+ runif(p_true, -5, 5)
+ )
+ theta_0 <- cbind(theta_0, matrix(0, ncol = p - p_true, nrow = 4))
+ y <- c(
+ x[1:300, ] %*% theta_0[1, ] + rnorm(300, 0, 1),
+ x[301:700, ] %*% theta_0[2, ] + rnorm(400, 0, 1),
+ x[701:1000, ] %*% theta_0[3, ] + rnorm(300, 0, 1),
+ x[1001:1500, ] %*% theta_0[4, ] + rnorm(500, 0, 1)
+ )
+ result <- fastcpd(
+ formula = y ~ . - 1,
+ data = data.frame(y = y, x = x),
+ family = "lasso"
+ )
+ plot(result)
+ summary(result)
+ # Custom cost function: logistic regression
+ library(fastcpd)
+ set.seed(1)
+ p <- 5
+ x <- matrix(rnorm(375 * p, 0, 1), ncol = p)
+ theta <- rbind(rnorm(p, 0, 1), rnorm(p, 2, 1))
+ y <- c(
+ rbinom(200, 1, 1 / (1 + exp(-x[1:200, ] %*% theta[1, ]))),
+ rbinom(175, 1, 1 / (1 + exp(-x[201:375, ] %*% theta[2, ])))
+ )
+ data <- data.frame(y = y, x = x)
+ result_builtin <- fastcpd(
+ formula = y ~ . - 1,
+ data = data,
+ family = "binomial"
+ )
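+  # The custom cost below is the logistic negative log-likelihood
+  # -y * u + log(1 + exp(u)); for u > 10 it is replaced by -y * u + u to avoid
+  # numerical overflow. The gradient and Hessian are evaluated at the newest
+  # observation only (the last row of `data`).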
+ logistic_loss <- function(data, theta) {
+ x <- data[, -1]
+ y <- data[, 1]
+ u <- x %*% theta
+ nll <- -y * u + log(1 + exp(u))
+ nll[u > 10] <- -y[u > 10] * u[u > 10] + u[u > 10]
+ sum(nll)
+ }
+ logistic_loss_gradient <- function(data, theta) {
+ x <- data[nrow(data), -1]
+ y <- data[nrow(data), 1]
+ c(-(y - 1 / (1 + exp(-x %*% theta)))) * x
+ }
+ logistic_loss_hessian <- function(data, theta) {
+ x <- data[nrow(data), -1]
+ prob <- 1 / (1 + exp(-x %*% theta))
+ (x %o% x) * c((1 - prob) * prob)
+ }
+ result_custom <- fastcpd(
+ formula = y ~ . - 1,
+ data = data,
+ epsilon = 1e-5,
+ cost = logistic_loss,
+ cost_gradient = logistic_loss_gradient,
+ cost_hessian = logistic_loss_hessian
+ )
+ cat(
+ "Change points detected by built-in logistic regression model: ",
+ result_builtin@cp_set, "\n",
+ "Change points detected by custom logistic regression model: ",
+ result_custom@cp_set, "\n",
+ sep = ""
+ )
+ # Custom cost function: mean shift
+ library(fastcpd)
+ set.seed(1)
+ p <- 1
+ data <- rbind(
+ mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(100, p)),
+ mvtnorm::rmvnorm(400, mean = rep(50, p), sigma = diag(100, p)),
+ mvtnorm::rmvnorm(300, mean = rep(2, p), sigma = diag(100, p))
+ )
+ segment_count_guess <- 10
+ block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2)
+ block_count <- floor(nrow(data) / block_size)
+ data_all_vars <- rep(0, block_count)
+ for (block_index in seq_len(block_count)) {
+ block_start <- (block_index - 1) * block_size + 1
+ block_end <- if (block_index < block_count) block_index * block_size else nrow(data)
+ data_all_vars[block_index] <- var(data[block_start:block_end, ])
+ }
+ data_all_var <- mean(data_all_vars)
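+  # Cost for a mean shift: the Gaussian negative log-likelihood of the segment,
+  # treating the variance as known and equal to the block-averaged estimate
+  # `data_all_var` computed above.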
+ mean_loss <- function(data) {
+ n <- nrow(data)
+ (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var +
+ n / 2 * (log(data_all_var) + log(2 * pi))
+ }
+ mean_loss_result <- fastcpd(
+ formula = ~ . - 1,
+ data = data.frame(data),
+ beta = (p + 1) * log(nrow(data)) / 2,
+ p = p,
+ cost = mean_loss
+ )
+ summary(mean_loss_result)
+ # Custom cost function: variance change
+ library(fastcpd)
+ set.seed(1)
+ p <- 1
+ data <- rbind.data.frame(
+ mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)),
+ mvtnorm::rmvnorm(400, mean = rep(0, p), sigma = diag(50, p)),
+ mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(2, p))
+ )
+ data_all_mu <- colMeans(data)
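+  # Cost for a variance change: the Gaussian negative log-likelihood with the
+  # mean fixed at the overall sample mean `data_all_mu` and the variance
+  # estimated within each segment.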
+ var_loss <- function(data) {
+ demeaned_data_norm <- norm(sweep(data, 2, data_all_mu), type = "F")
+ nrow(data) * (1 + log(2 * pi) + log(demeaned_data_norm^2 / nrow(data))) / 2
+ }
+ var_loss_result <- fastcpd(
+ formula = ~ . - 1,
+ data = data,
+ beta = (p + 1) * log(nrow(data)) / 2,
+ p = p,
+ cost = var_loss
+ )
+ summary(var_loss_result)
+ # Custom cost function: mean shift and variance change
+ library(fastcpd)
+ set.seed(1)
+ p <- 1
+ data <- rbind.data.frame(
+ mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)),
+ mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)),
+ mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(50, p)),
+ mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)),
+ mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)),
+ mvtnorm::rmvnorm(300, mean = rep(10, p), sigma = diag(50, p))
+ )
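+  # Cost for simultaneous mean and variance changes: the Gaussian negative
+  # log-likelihood with both parameters estimated within each segment, hence
+  # two parameters per dimension (the `p = 2 * p` argument below).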
+ meanvar_loss <- function(data) {
+ loss_part <- (colSums(data^2) - colSums(data)^2 / nrow(data)) / nrow(data)
+ nrow(data) * (1 + log(2 * pi) + log(loss_part)) / 2
+ }
+ meanvar_loss_result <- fastcpd(
+ formula = ~ . - 1,
+ data = data,
+ beta = (2 * p + 1) * log(nrow(data)) / 2,
+ p = 2 * p,
+ cost = meanvar_loss
+ )
+ summary(meanvar_loss_result)
+ # Custom cost function: Huber loss
+ library(fastcpd)
+ set.seed(1)
+ n <- 400 + 300 + 500
+ p <- 5
+ x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(p))
+ theta <- rbind(
+ mvtnorm::rmvnorm(1, mean = rep(0, p - 3), sigma = diag(p - 3)),
+ mvtnorm::rmvnorm(1, mean = rep(5, p - 3), sigma = diag(p - 3)),
+ mvtnorm::rmvnorm(1, mean = rep(9, p - 3), sigma = diag(p - 3))
+ )
+ theta <- cbind(theta, matrix(0, 3, 3))
+ theta <- theta[rep(seq_len(3), c(400, 300, 500)), ]
+ y_true <- rowSums(x * theta)
+ factor <- c(
+ 2 * stats::rbinom(400, size = 1, prob = 0.95) - 1,
+ 2 * stats::rbinom(300, size = 1, prob = 0.95) - 1,
+ 2 * stats::rbinom(500, size = 1, prob = 0.95) - 1
+ )
+ y <- factor * y_true + stats::rnorm(n)
+ data <- cbind.data.frame(y, x)
+ huber_threshold <- 1
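+  # Cost for robust regression: the Huber loss, quadratic for residuals within
+  # `huber_threshold` and linear beyond it. In the linear region the exact
+  # Hessian is zero, so the custom Hessian falls back to a small multiple of
+  # the identity matrix.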
+ huber_loss <- function(data, theta) {
+ residual <- data[, 1] - data[, -1, drop = FALSE] %*% theta
+ indicator <- abs(residual) <= huber_threshold
+ sum(
+ residual^2 / 2 * indicator +
+ huber_threshold * (abs(residual) - huber_threshold / 2) * (1 - indicator)
+ )
+ }
+ huber_loss_gradient <- function(data, theta) {
+ residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta)
+ if (abs(residual) <= huber_threshold) {
+ -residual * data[nrow(data), -1]
+ } else {
+ -huber_threshold * sign(residual) * data[nrow(data), -1]
+ }
+ }
+ huber_loss_hessian <- function(data, theta) {
+ residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta)
+ if (abs(residual) <= huber_threshold) {
+ outer(data[nrow(data), -1], data[nrow(data), -1])
+ } else {
+ 0.01 * diag(length(theta))
+ }
+ }
+ huber_regression_result <- fastcpd(
+ formula = y ~ . - 1,
+ data = data,
+ beta = (p + 1) * log(n) / 2,
+ cost = huber_loss,
+ cost_gradient = huber_loss_gradient,
+ cost_hessian = huber_loss_hessian
+ )
+ summary(huber_regression_result)
+ ```
+
+</details>
+
## Installation
diff --git a/README.md b/README.md
index 1fefe983..d69c9fef 100644
--- a/README.md
+++ b/README.md
@@ -28,8 +28,483 @@ to:
Change-Point
Analysis](https://proceedings.mlr.press/v206/zhang23b.html)
-[documentation &
-examples](https://fastcpd.xingchi.li/reference/fastcpd.html#ref-examples)
+## Examples
+
+[Documentation](https://fastcpd.xingchi.li/reference/fastcpd.html#ref-examples)
+
+
+<details close>
+<summary>
+Example usage of `fastcpd`
+</summary>
+
+``` r
+ # Linear regression
+ library(fastcpd)
+ set.seed(1)
+ p <- 3
+ x <- mvtnorm::rmvnorm(300, rep(0, p), diag(p))
+ theta_0 <- rbind(c(1, 1.2, -1), c(-1, 0, 0.5), c(0.5, -0.3, 0.2))
+ y <- c(
+ x[1:100, ] %*% theta_0[1, ] + rnorm(100, 0, 1),
+ x[101:200, ] %*% theta_0[2, ] + rnorm(100, 0, 1),
+ x[201:300, ] %*% theta_0[3, ] + rnorm(100, 0, 1)
+ )
+ result <- fastcpd(
+ formula = y ~ . - 1,
+ data = data.frame(y = y, x = x),
+ family = "gaussian"
+ )
+ plot(result)
+```
+
+![](man/figures/README-examples-1.png)
+
+``` r
+ summary(result)
+ #>
+ #> Call:
+ #> fastcpd(formula = y ~ . - 1, data = data.frame(y = y, x = x),
+ #> family = "gaussian")
+ #>
+ #> Change points:
+ #> 98 202
+ #>
+ #> Cost values:
+ #> 53.44023 53.1441 45.04974
+ #>
+ #> Parameters:
+ #> segment 1 segment 2 segment 3
+ #> 1 0.9704022 -1.07884004 0.5925092
+ #> 2 1.1786074 -0.01757927 -0.5287126
+ #> 3 -0.9258587 0.63906143 0.1929411
+ # Linear regression with one-dimensional covariate
+ library(fastcpd)
+ set.seed(1)
+ p <- 1
+ x <- mvtnorm::rmvnorm(300, rep(0, p), diag(p))
+ theta_0 <- matrix(c(1, -1, 0.5))
+ y <- c(
+ x[1:100, ] * theta_0[1, ] + rnorm(100, 0, 1),
+ x[101:200, ] * theta_0[2, ] + rnorm(100, 0, 1),
+ x[201:300, ] * theta_0[3, ] + rnorm(100, 0, 1)
+ )
+ result <- fastcpd(
+ formula = y ~ . - 1,
+ data = data.frame(y = y, x = x),
+ family = "gaussian"
+ )
+ plot(result)
+```
+
+![](man/figures/README-examples-2.png)
+
+``` r
+ summary(result)
+ #>
+ #> Call:
+ #> fastcpd(formula = y ~ . - 1, data = data.frame(y = y, x = x),
+ #> family = "gaussian")
+ #>
+ #> Change points:
+ #> 100 194
+ #>
+ #> Cost values:
+ #> 48.71927 57.20738 63.15088
+ #>
+ #> Parameters:
+ #> segment 1 segment 2 segment 3
+ #> 1 0.9520606 -0.8054074 0.3692224
+ # Logistic regression
+ library(fastcpd)
+ set.seed(1)
+ x <- matrix(rnorm(1500, 0, 1), ncol = 5)
+ theta <- rbind(rnorm(5, 0, 1), rnorm(5, 2, 1))
+ y <- c(
+ rbinom(125, 1, 1 / (1 + exp(-x[1:125, ] %*% theta[1, ]))),
+ rbinom(175, 1, 1 / (1 + exp(-x[126:300, ] %*% theta[2, ])))
+ )
+ result <- suppressWarnings(fastcpd(
+ formula = y ~ . - 1,
+ data = data.frame(y = y, x = x),
+ family = "binomial"
+ ))
+ summary(result)
+ #>
+ #> Call:
+ #> fastcpd(formula = y ~ . - 1, data = data.frame(y = y, x = x),
+ #> family = "binomial")
+ #>
+ #> Change points:
+ #> 126
+ #>
+ #> Cost values:
+ #> 56.90525 30.76875
+ #>
+ #> Parameters:
+ #> segment 1 segment 2
+ #> 1 0.7259293 1.878525
+ #> 2 -1.0294802 2.704376
+ #> 3 1.0576503 3.702310
+ #> 4 -0.8812767 2.258796
+ #> 5 0.2419351 2.524173
+ # Poisson regression
+ library(fastcpd)
+ set.seed(1)
+ p <- 3
+ x <- mvtnorm::rmvnorm(1500, rep(0, p), diag(p))
+ delta <- rnorm(p)
+ theta_0 <- c(1, 1.2, -1)
+ y <- c(
+ rpois(300, exp(x[1:300, ] %*% theta_0)),
+ rpois(400, exp(x[301:700, ] %*% (theta_0 + delta))),
+ rpois(300, exp(x[701:1000, ] %*% theta_0)),
+ rpois(100, exp(x[1001:1100, ] %*% (theta_0 - delta))),
+ rpois(200, exp(x[1101:1300, ] %*% theta_0)),
+ rpois(200, exp(x[1301:1500, ] %*% (theta_0 + delta)))
+ )
+ result <- fastcpd(
+ formula = y ~ . - 1,
+ data = data.frame(y = y, x = x),
+ beta = (p + 1) * log(1500) / 2,
+ k = function(x) 0,
+ family = "poisson",
+ epsilon = 1e-5
+ )
+ summary(result)
+ #>
+ #> Call:
+ #> fastcpd(formula = y ~ . - 1, data = data.frame(y = y, x = x),
+ #> beta = (p + 1) * log(1500)/2, k = function(x) 0, family = "poisson",
+ #> epsilon = 1e-05)
+ #>
+ #> Change points:
+ #> 329 728 1021 1107 1325
+ #>
+ #> Cost values:
+ #> 14425.87 13971.23 697.2187 107.5353 380.7153 51.93594
+ #>
+ #> Parameters:
+ #> segment 1 segment 2 segment 3 segment 4 segment 5 segment 6
+ #> 1 2.60927673 1.9255183 0.7405125 -0.3965022 1.117753 2.5479308
+ #> 2 0.02398457 0.1068924 1.4721444 1.8677797 1.019035 0.4947115
+ #> 3 -1.34361104 -2.7353603 -0.8906937 0.4651667 -1.178933 -2.5038966
+ # Penalized linear regression
+ library(fastcpd)
+ set.seed(1)
+ n <- 1500
+ p_true <- 6
+ p <- 50
+ x <- mvtnorm::rmvnorm(1500, rep(0, p), diag(p))
+ theta_0 <- rbind(
+ runif(p_true, -5, -2),
+ runif(p_true, -3, 3),
+ runif(p_true, 2, 5),
+ runif(p_true, -5, 5)
+ )
+ theta_0 <- cbind(theta_0, matrix(0, ncol = p - p_true, nrow = 4))
+ y <- c(
+ x[1:300, ] %*% theta_0[1, ] + rnorm(300, 0, 1),
+ x[301:700, ] %*% theta_0[2, ] + rnorm(400, 0, 1),
+ x[701:1000, ] %*% theta_0[3, ] + rnorm(300, 0, 1),
+ x[1001:1500, ] %*% theta_0[4, ] + rnorm(500, 0, 1)
+ )
+ result <- fastcpd(
+ formula = y ~ . - 1,
+ data = data.frame(y = y, x = x),
+ family = "lasso"
+ )
+ plot(result)
+```
+
+![](man/figures/README-examples-3.png)
+
+``` r
+ summary(result)
+ #>
+ #> Call:
+ #> fastcpd(formula = y ~ . - 1, data = data.frame(y = y, x = x),
+ #> family = "lasso")
+ #>
+ #> Change points:
+ #> 300 701 1000
+ #>
+ #> Cost values:
+ #> 181.7214 237.5852 164.4635 279.2901
+ #>
+ #> Parameters:
+ #> 50 x 4 sparse Matrix of class "dgCMatrix"
+ #> segment 1 segment 2 segment 3 segment 4
+ #> [1,] -2.950291 0.3790728 4.108208 -0.04955231
+ #> [2,] -2.896748 -0.3979970 3.952567 3.15522854
+ #> [3,] -2.875025 -0.2419562 2.634441 2.82389558
+ #> [4,] -1.982441 0.5145819 3.370837 -0.58643066
+ #> [5,] -3.116640 -0.5223081 2.140809 -3.42307221
+ #> [6,] -1.908975 0.4789674 4.852307 .
+ #> [7,] . . . .
+ #> [8,] . . . .
+ #> [9,] . . . .
+ #> [10,] . . . .
+ #> [11,] . . . .
+ #> [12,] . . . .
+ #> [13,] . . . .
+ #> [14,] . . . .
+ #> [15,] . . . .
+ #> [16,] . . . .
+ #> [17,] . . . .
+ #> [18,] . . . .
+ #> [19,] . . . .
+ #> [20,] . . . .
+ #> [21,] . . . .
+ #> [22,] . . . .
+ #> [23,] . . . .
+ #> [24,] . . . .
+ #> [25,] . . . .
+ #> [26,] . . . .
+ #> [27,] . . . .
+ #> [28,] . . . .
+ #> [29,] . . . .
+ #> [30,] . . . .
+ #> [31,] . . . .
+ #> [32,] . . . .
+ #> [33,] . . . .
+ #> [34,] . . . .
+ #> [35,] . . . .
+ #> [36,] . . . .
+ #> [37,] . . . .
+ #> [38,] . . . .
+ #> [39,] . . . .
+ #> [40,] . . . .
+ #> [41,] . . . .
+ #> [42,] . . . .
+ #> [43,] . . . .
+ #> [44,] . . . .
+ #> [45,] . . . .
+ #> [46,] . . . .
+ #> [47,] . . . .
+ #> [48,] . . . .
+ #> [49,] . . . .
+ #> [50,] . . . .
+ # Custom cost function: logistic regression
+ library(fastcpd)
+ set.seed(1)
+ p <- 5
+ x <- matrix(rnorm(375 * p, 0, 1), ncol = p)
+ theta <- rbind(rnorm(p, 0, 1), rnorm(p, 2, 1))
+ y <- c(
+ rbinom(200, 1, 1 / (1 + exp(-x[1:200, ] %*% theta[1, ]))),
+ rbinom(175, 1, 1 / (1 + exp(-x[201:375, ] %*% theta[2, ])))
+ )
+ data <- data.frame(y = y, x = x)
+ result_builtin <- fastcpd(
+ formula = y ~ . - 1,
+ data = data,
+ family = "binomial"
+ )
+ #> Warning: fit_glm: fitted probabilities numerically 0 or 1 occurred
+
+ #> Warning: fit_glm: fitted probabilities numerically 0 or 1 occurred
+
+ #> Warning: fit_glm: fitted probabilities numerically 0 or 1 occurred
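+  # The custom cost below is the logistic negative log-likelihood
+  # -y * u + log(1 + exp(u)); for u > 10 it is replaced by -y * u + u to avoid
+  # numerical overflow. The gradient and Hessian are evaluated at the newest
+  # observation only (the last row of `data`).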
+ logistic_loss <- function(data, theta) {
+ x <- data[, -1]
+ y <- data[, 1]
+ u <- x %*% theta
+ nll <- -y * u + log(1 + exp(u))
+ nll[u > 10] <- -y[u > 10] * u[u > 10] + u[u > 10]
+ sum(nll)
+ }
+ logistic_loss_gradient <- function(data, theta) {
+ x <- data[nrow(data), -1]
+ y <- data[nrow(data), 1]
+ c(-(y - 1 / (1 + exp(-x %*% theta)))) * x
+ }
+ logistic_loss_hessian <- function(data, theta) {
+ x <- data[nrow(data), -1]
+ prob <- 1 / (1 + exp(-x %*% theta))
+ (x %o% x) * c((1 - prob) * prob)
+ }
+ result_custom <- fastcpd(
+ formula = y ~ . - 1,
+ data = data,
+ epsilon = 1e-5,
+ cost = logistic_loss,
+ cost_gradient = logistic_loss_gradient,
+ cost_hessian = logistic_loss_hessian
+ )
+ cat(
+ "Change points detected by built-in logistic regression model: ",
+ result_builtin@cp_set, "\n",
+ "Change points detected by custom logistic regression model: ",
+ result_custom@cp_set, "\n",
+ sep = ""
+ )
+ #> Change points detected by built-in logistic regression model: 200
+ #> Change points detected by custom logistic regression model: 201
+ # Custom cost function: mean shift
+ library(fastcpd)
+ set.seed(1)
+ p <- 1
+ data <- rbind(
+ mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(100, p)),
+ mvtnorm::rmvnorm(400, mean = rep(50, p), sigma = diag(100, p)),
+ mvtnorm::rmvnorm(300, mean = rep(2, p), sigma = diag(100, p))
+ )
+ segment_count_guess <- 10
+ block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2)
+ block_count <- floor(nrow(data) / block_size)
+ data_all_vars <- rep(0, block_count)
+ for (block_index in seq_len(block_count)) {
+ block_start <- (block_index - 1) * block_size + 1
+ block_end <- if (block_index < block_count) block_index * block_size else nrow(data)
+ data_all_vars[block_index] <- var(data[block_start:block_end, ])
+ }
+ data_all_var <- mean(data_all_vars)
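+  # Cost for a mean shift: the Gaussian negative log-likelihood of the segment,
+  # treating the variance as known and equal to the block-averaged estimate
+  # `data_all_var` computed above.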
+ mean_loss <- function(data) {
+ n <- nrow(data)
+ (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var +
+ n / 2 * (log(data_all_var) + log(2 * pi))
+ }
+ mean_loss_result <- fastcpd(
+ formula = ~ . - 1,
+ data = data.frame(data),
+ beta = (p + 1) * log(nrow(data)) / 2,
+ p = p,
+ cost = mean_loss
+ )
+ summary(mean_loss_result)
+ #>
+ #> Call:
+ #> fastcpd(formula = ~. - 1, data = data.frame(data), beta = (p +
+ #> 1) * log(nrow(data))/2, p = p, cost = mean_loss)
+ #>
+ #> Change points:
+ #> 300 700
+ # Custom cost function: variance change
+ library(fastcpd)
+ set.seed(1)
+ p <- 1
+ data <- rbind.data.frame(
+ mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)),
+ mvtnorm::rmvnorm(400, mean = rep(0, p), sigma = diag(50, p)),
+ mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(2, p))
+ )
+ data_all_mu <- colMeans(data)
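+  # Cost for a variance change: the Gaussian negative log-likelihood with the
+  # mean fixed at the overall sample mean `data_all_mu` and the variance
+  # estimated within each segment.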
+ var_loss <- function(data) {
+ demeaned_data_norm <- norm(sweep(data, 2, data_all_mu), type = "F")
+ nrow(data) * (1 + log(2 * pi) + log(demeaned_data_norm^2 / nrow(data))) / 2
+ }
+ var_loss_result <- fastcpd(
+ formula = ~ . - 1,
+ data = data,
+ beta = (p + 1) * log(nrow(data)) / 2,
+ p = p,
+ cost = var_loss
+ )
+ summary(var_loss_result)
+ #>
+ #> Call:
+ #> fastcpd(formula = ~. - 1, data = data, beta = (p + 1) * log(nrow(data))/2,
+ #> p = p, cost = var_loss)
+ #>
+ #> Change points:
+ #> 300 699
+ # Custom cost function: mean shift and variance change
+ library(fastcpd)
+ set.seed(1)
+ p <- 1
+ data <- rbind.data.frame(
+ mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)),
+ mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)),
+ mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(50, p)),
+ mvtnorm::rmvnorm(300, mean = rep(0, p), sigma = diag(1, p)),
+ mvtnorm::rmvnorm(400, mean = rep(10, p), sigma = diag(1, p)),
+ mvtnorm::rmvnorm(300, mean = rep(10, p), sigma = diag(50, p))
+ )
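+  # Cost for simultaneous mean and variance changes: the Gaussian negative
+  # log-likelihood with both parameters estimated within each segment, hence
+  # two parameters per dimension (the `p = 2 * p` argument below).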
+ meanvar_loss <- function(data) {
+ loss_part <- (colSums(data^2) - colSums(data)^2 / nrow(data)) / nrow(data)
+ nrow(data) * (1 + log(2 * pi) + log(loss_part)) / 2
+ }
+ meanvar_loss_result <- fastcpd(
+ formula = ~ . - 1,
+ data = data,
+ beta = (2 * p + 1) * log(nrow(data)) / 2,
+ p = 2 * p,
+ cost = meanvar_loss
+ )
+ summary(meanvar_loss_result)
+ #>
+ #> Call:
+ #> fastcpd(formula = ~. - 1, data = data, beta = (2 * p + 1) * log(nrow(data))/2,
+ #> p = 2 * p, cost = meanvar_loss)
+ #>
+ #> Change points:
+ #> 300 700 1000 1300 1700
+ # Custom cost function: Huber loss
+ library(fastcpd)
+ set.seed(1)
+ n <- 400 + 300 + 500
+ p <- 5
+ x <- mvtnorm::rmvnorm(n, mean = rep(0, p), sigma = diag(p))
+ theta <- rbind(
+ mvtnorm::rmvnorm(1, mean = rep(0, p - 3), sigma = diag(p - 3)),
+ mvtnorm::rmvnorm(1, mean = rep(5, p - 3), sigma = diag(p - 3)),
+ mvtnorm::rmvnorm(1, mean = rep(9, p - 3), sigma = diag(p - 3))
+ )
+ theta <- cbind(theta, matrix(0, 3, 3))
+ theta <- theta[rep(seq_len(3), c(400, 300, 500)), ]
+ y_true <- rowSums(x * theta)
+ factor <- c(
+ 2 * stats::rbinom(400, size = 1, prob = 0.95) - 1,
+ 2 * stats::rbinom(300, size = 1, prob = 0.95) - 1,
+ 2 * stats::rbinom(500, size = 1, prob = 0.95) - 1
+ )
+ y <- factor * y_true + stats::rnorm(n)
+ data <- cbind.data.frame(y, x)
+ huber_threshold <- 1
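+  # Cost for robust regression: the Huber loss, quadratic for residuals within
+  # `huber_threshold` and linear beyond it. In the linear region the exact
+  # Hessian is zero, so the custom Hessian falls back to a small multiple of
+  # the identity matrix.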
+ huber_loss <- function(data, theta) {
+ residual <- data[, 1] - data[, -1, drop = FALSE] %*% theta
+ indicator <- abs(residual) <= huber_threshold
+ sum(
+ residual^2 / 2 * indicator +
+ huber_threshold * (abs(residual) - huber_threshold / 2) * (1 - indicator)
+ )
+ }
+ huber_loss_gradient <- function(data, theta) {
+ residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta)
+ if (abs(residual) <= huber_threshold) {
+ -residual * data[nrow(data), -1]
+ } else {
+ -huber_threshold * sign(residual) * data[nrow(data), -1]
+ }
+ }
+ huber_loss_hessian <- function(data, theta) {
+ residual <- c(data[nrow(data), 1] - data[nrow(data), -1] %*% theta)
+ if (abs(residual) <= huber_threshold) {
+ outer(data[nrow(data), -1], data[nrow(data), -1])
+ } else {
+ 0.01 * diag(length(theta))
+ }
+ }
+ huber_regression_result <- fastcpd(
+ formula = y ~ . - 1,
+ data = data,
+ beta = (p + 1) * log(n) / 2,
+ cost = huber_loss,
+ cost_gradient = huber_loss_gradient,
+ cost_hessian = huber_loss_hessian
+ )
+ summary(huber_regression_result)
+ #>
+ #> Call:
+ #> fastcpd(formula = y ~ . - 1, data = data, beta = (p + 1) * log(n)/2,
+ #> cost = huber_loss, cost_gradient = huber_loss_gradient, cost_hessian = huber_loss_hessian)
+ #>
+ #> Change points:
+ #> 401 726
+```
+
+</details>
+
## Installation
diff --git a/man/figures/README-examples-1.png b/man/figures/README-examples-1.png
new file mode 100644
index 00000000..79778f42
Binary files /dev/null and b/man/figures/README-examples-1.png differ
diff --git a/man/figures/README-examples-2.png b/man/figures/README-examples-2.png
new file mode 100644
index 00000000..78520f99
Binary files /dev/null and b/man/figures/README-examples-2.png differ
diff --git a/man/figures/README-examples-3.png b/man/figures/README-examples-3.png
new file mode 100644
index 00000000..2de50605
Binary files /dev/null and b/man/figures/README-examples-3.png differ
diff --git a/man/figures/manual.pdf b/man/figures/manual.pdf
new file mode 100644
index 00000000..1cfc5b2e
Binary files /dev/null and b/man/figures/manual.pdf differ
diff --git a/tests/testthat/test-fastcpd.R b/tests/testthat/test-fastcpd.R
index 89b27976..be51d3e5 100644
--- a/tests/testthat/test-fastcpd.R
+++ b/tests/testthat/test-fastcpd.R
@@ -461,80 +461,158 @@ test_that("huber regression with 0.1 vanilla", {
expect_equal(huber_regression_result@cp_set, c(401, 726))
})
-test_that("confidence interval experiment", {
- set.seed(1)
- kDimension <- 1
- change_point_locations <- NULL
- for (experiment_id in seq_len(20)) {
- data <- rbind(
- mvtnorm::rmvnorm(300, mean = rep(0, kDimension), sigma = diag(100, kDimension)),
- mvtnorm::rmvnorm(400, mean = rep(50, kDimension), sigma = diag(100, kDimension)),
- mvtnorm::rmvnorm(300, mean = rep(2, kDimension), sigma = diag(100, kDimension))
- )
- segment_count_guess <- 10
- block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2)
- block_count <- floor(nrow(data) / block_size)
- data_all_vars <- rep(0, block_count)
- for (block_index in seq_len(block_count)) {
- block_start <- (block_index - 1) * block_size + 1
- block_end <- if (block_index < block_count) block_index * block_size else nrow(data)
- data_all_vars[block_index] <- var(data[block_start:block_end, ])
- }
- data_all_var <- mean(data_all_vars)
- mean_loss <- function(data) {
- n <- nrow(data)
- (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var +
- n / 2 * (log(data_all_var) + log(2 * pi))
- }
- mean_loss_result <- fastcpd(
- formula = ~ . - 1,
- data = data.frame(data),
- beta = (kDimension + 1) * log(nrow(data)) / 2,
- p = kDimension,
- cost = mean_loss
- )
- change_point_locations <- c(change_point_locations, mean_loss_result@cp_set)
- }
-
- change_point_locations_cookie_bucket <- NULL
- cookie_bucket_id_list <- sample.int(n = 20, size = 1000, replace = TRUE)
- all_data <- data
- for (cookie_bucket_id in seq_len(20)) {
- data <- all_data[cookie_bucket_id_list != cookie_bucket_id, , drop = FALSE]
- segment_count_guess <- 10
- block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2)
- block_count <- floor(nrow(data) / block_size)
- data_all_vars <- rep(0, block_count)
- for (block_index in seq_len(block_count)) {
- block_start <- (block_index - 1) * block_size + 1
- block_end <- if (block_index < block_count) block_index * block_size else nrow(data)
- data_all_vars[block_index] <- var(data[block_start:block_end, ])
- }
- data_all_var <- mean(data_all_vars)
- mean_loss <- function(data) {
- n <- nrow(data)
- (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var +
- n / 2 * (log(data_all_var) + log(2 * pi))
- }
- mean_loss_result <- fastcpd(
- formula = ~ . - 1,
- data = data.frame(data),
- beta = (kDimension + 1) * log(nrow(data)) / 2,
- p = kDimension,
- cost = mean_loss
- )
-
- for (cp in mean_loss_result@cp_set) {
- ordinal_mapped_cp <- which(cumsum(cookie_bucket_id_list != cookie_bucket_id) == cp)[1]
- change_point_locations_cookie_bucket <- c(change_point_locations_cookie_bucket, ordinal_mapped_cp)
- }
- }
-
- table_change_point_locations <- table(change_point_locations)
- expect_equal(rownames(table_change_point_locations), c("269", "299", "300", "700"))
- expect_equal(unname(table_change_point_locations), c(1, 1, 19, 20), ignore_attr = TRUE)
-
- table_change_point_locations_cookie_bucket <- table(change_point_locations_cookie_bucket)
- expect_equal(rownames(table_change_point_locations_cookie_bucket), c("299", "300", "697", "700"))
- expect_equal(unname(table_change_point_locations_cookie_bucket), c(1, 19, 1, 19), ignore_attr = TRUE)
-})
+# test_that("confidence interval experiment", {
+# set.seed(1)
+# kDimension <- 1
+# change_point_locations <- NULL
+# for (experiment_id in seq_len(20)) {
+# data <- rbind(
+# mvtnorm::rmvnorm(300, mean = rep(0, kDimension), sigma = diag(100, kDimension)),
+# mvtnorm::rmvnorm(400, mean = rep(50, kDimension), sigma = diag(100, kDimension)),
+# mvtnorm::rmvnorm(300, mean = rep(2, kDimension), sigma = diag(100, kDimension))
+# )
+# segment_count_guess <- 10
+# block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2)
+# block_count <- floor(nrow(data) / block_size)
+# data_all_vars <- rep(0, block_count)
+# for (block_index in seq_len(block_count)) {
+# block_start <- (block_index - 1) * block_size + 1
+# block_end <- if (block_index < block_count) block_index * block_size else nrow(data)
+# data_all_vars[block_index] <- var(data[block_start:block_end, ])
+# }
+# data_all_var <- mean(data_all_vars)
+# mean_loss <- function(data) {
+# n <- nrow(data)
+# (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var +
+# n / 2 * (log(data_all_var) + log(2 * pi))
+# }
+# mean_loss_result <- fastcpd(
+# formula = ~ . - 1,
+# data = data.frame(data),
+# beta = (kDimension + 1) * log(nrow(data)) / 2,
+# p = kDimension,
+# cost = mean_loss
+# )
+# change_point_locations <- c(change_point_locations, mean_loss_result@cp_set)
+# }
+
+# change_point_locations_cookie_bucket <- NULL
+# cookie_bucket_id_list <- sample.int(n = 20, size = 1000, replace = TRUE)
+# all_data <- data
+# for (cookie_bucket_id in seq_len(20)) {
+# data <- all_data[cookie_bucket_id_list != cookie_bucket_id, , drop = FALSE]
+# segment_count_guess <- 10
+# block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2)
+# block_count <- floor(nrow(data) / block_size)
+# data_all_vars <- rep(0, block_count)
+# for (block_index in seq_len(block_count)) {
+# block_start <- (block_index - 1) * block_size + 1
+# block_end <- if (block_index < block_count) block_index * block_size else nrow(data)
+# data_all_vars[block_index] <- var(data[block_start:block_end, ])
+# }
+# data_all_var <- mean(data_all_vars)
+# mean_loss <- function(data) {
+# n <- nrow(data)
+# (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var +
+# n / 2 * (log(data_all_var) + log(2 * pi))
+# }
+# mean_loss_result <- fastcpd(
+# formula = ~ . - 1,
+# data = data.frame(data),
+# beta = (kDimension + 1) * log(nrow(data)) / 2,
+# p = kDimension,
+# cost = mean_loss
+# )
+
+# for (cp in mean_loss_result@cp_set) {
+# ordinal_mapped_cp <- which(cumsum(cookie_bucket_id_list != cookie_bucket_id) == cp)[1]
+# change_point_locations_cookie_bucket <- c(change_point_locations_cookie_bucket, ordinal_mapped_cp)
+# }
+# }
+
+# table_change_point_locations <- table(change_point_locations)
+# expect_equal(rownames(table_change_point_locations), c("269", "299", "300", "700"))
+# expect_equal(unname(table_change_point_locations), c(1, 1, 19, 20), ignore_attr = TRUE)
+
+# table_change_point_locations_cookie_bucket <- table(change_point_locations_cookie_bucket)
+# expect_equal(rownames(table_change_point_locations_cookie_bucket), c("299", "300", "697", "700"))
+# expect_equal(unname(table_change_point_locations_cookie_bucket), c(1, 19, 1, 19), ignore_attr = TRUE)
+# })
+
+# test_that("confidence interval experiment with one change point", {
+# set.seed(1)
+# kDimension <- 1
+# change_point_locations <- list()
+# change_point_locations_cookie_bucket <- rep(list(NULL), 1000)
+# containing_change_point <- rep(FALSE, 1000)
+# for (experiment_id in seq_len(1000)) {
+# data <- rbind(
+# mvtnorm::rmvnorm(50, mean = rep(0, kDimension), sigma = diag(100, kDimension)),
+# mvtnorm::rmvnorm(50, mean = rep(50, kDimension), sigma = diag(100, kDimension))
+# )
+# segment_count_guess <- 10
+# block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2)
+# block_count <- floor(nrow(data) / block_size)
+# data_all_vars <- rep(0, block_count)
+# for (block_index in seq_len(block_count)) {
+# block_start <- (block_index - 1) * block_size + 1
+# block_end <- if (block_index < block_count) block_index * block_size else nrow(data)
+# data_all_vars[block_index] <- var(data[block_start:block_end, ])
+# }
+# data_all_var <- mean(data_all_vars)
+# mean_loss <- function(data) {
+# n <- nrow(data)
+# (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var +
+# n / 2 * (log(data_all_var) + log(2 * pi))
+# }
+# mean_loss_result <- fastcpd(
+# formula = ~ . - 1,
+# data = data.frame(data),
+# beta = (kDimension + 1) * log(nrow(data)) / 2,
+# p = kDimension,
+# cost = mean_loss
+# )
+# change_point_locations[[experiment_id]] <- mean_loss_result@cp_set
+
+# cookie_bucket_id_list <- sample.int(n = 20, size = 50 + 50, replace = TRUE)
+# all_data <- data
+# for (cookie_bucket_id in seq_len(20)) {
+# data <- all_data[cookie_bucket_id_list != cookie_bucket_id, , drop = FALSE]
+# segment_count_guess <- 10
+# block_size <- max(floor(sqrt(nrow(data)) / (segment_count_guess + 1)), 2)
+# block_count <- floor(nrow(data) / block_size)
+# data_all_vars <- rep(0, block_count)
+# for (block_index in seq_len(block_count)) {
+# block_start <- (block_index - 1) * block_size + 1
+# block_end <- if (block_index < block_count) block_index * block_size else nrow(data)
+# data_all_vars[block_index] <- var(data[block_start:block_end, ])
+# }
+# data_all_var <- mean(data_all_vars)
+# mean_loss <- function(data) {
+# n <- nrow(data)
+# (norm(data, type = "F")^2 - colSums(data)^2 / n) / 2 / data_all_var +
+# n / 2 * (log(data_all_var) + log(2 * pi))
+# }
+# mean_loss_result <- fastcpd(
+# formula = ~ . - 1,
+# data = data.frame(data),
+# beta = (kDimension + 1) * log(nrow(data)) / 2,
+# p = kDimension,
+# cost = mean_loss
+# )
+
+# for (cp in mean_loss_result@cp_set) {
+# ordinal_mapped_cp <- which(cumsum(cookie_bucket_id_list != cookie_bucket_id) == cp)[1]
+# change_point_locations_cookie_bucket[[experiment_id]] <- c(change_point_locations_cookie_bucket[[experiment_id]], ordinal_mapped_cp)
+# }
+# }
+# if (
+# quantile(change_point_locations_cookie_bucket[[experiment_id]], 0.025) <= 50 &&
+# quantile(change_point_locations_cookie_bucket[[experiment_id]], 0.975) >= 50
+# ) {
+# containing_change_point[experiment_id] <- TRUE
+# }
+# }
+
+# expect_equal(sum(containing_change_point), 927)
+# })