diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..26d1514 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,25 @@ +Package: knnp +Version: 1.0.0 +Date: 2018-06-18 +Title: Time Series Prediction using K-Nearest Neighbors Algorithm + (Parallel) +Authors@R: c( + person("Daniel", "Bastarrica Lacalle", email="danibast@ucm.es", role=c("aut")), + person("Javier", "Berdecio Trigueros", email="javierberdeciot@gmail.com", role=c("aut", "cre")) + ) +Depends: R (>= 3.3.3) +Imports: parallelDist, forecast, stats, utils, doParallel, foreach +Description: Two main functionalities are provided. One of them is predicting values with + k-nearest neighbors algorithm and the other is optimizing the parameters k and d of the algorithm. + These are carried out in parallel using multiple threads. +License: AGPL-3 +RoxygenNote: 6.0.1 +URL: https://github.com/Dani-Basta/TFG +BugReports: https://github.com/Dani-Basta/TFG/issues +NeedsCompilation: no +Packaged: 2018-06-18 04:15:01 UTC; javier +Author: Daniel Bastarrica Lacalle [aut], + Javier Berdecio Trigueros [aut, cre] +Maintainer: Javier Berdecio Trigueros +Repository: CRAN +Date/Publication: 2018-07-01 15:00:02 UTC diff --git a/MD5 b/MD5 new file mode 100644 index 0000000..01cd63c --- /dev/null +++ b/MD5 @@ -0,0 +1,19 @@ +959faf2367e8343a11d14540d6a79e2a *DESCRIPTION +21a5cd3c7003d8c722ebef3867bdd70c *NAMESPACE +df60a570ff76d200c2f5f10314ac936a *NEWS.md +b70db3525ef00b36e4cfafdbb2451e1f *R/knn_distances.R +afbca240fc22fd11670cbf84420c1875 *R/knn_elements.R +4b4cfde192f48759147fba1567948a83 *R/knn_next.R +0670854a515ec04ff6828a44b6e68f19 *R/knn_optim.R +17fbd8144d41877fb50f3b3f4bb8e0aa *R/knn_optim_parallel.R +1a9c5e30c3a3762e0bdbe47b7b20e892 *R/knn_optim_parallel2.R +3c128712999e376d49fa65a156930a37 *R/knn_optim_parallelf.R +b7698a8076ed266064cbf29599f254a2 *R/knn_past.R +0cdeeae6ebeb075d9a8c2f53db3cc196 *man/knn_distances.Rd +357273707843f33aecf579b18633a465 *man/knn_elements.Rd +03be41ee392eceae25c60cb6db79308f 
*man/knn_next.Rd +ac69fe9d3c9f59fa326d3d22024ddef1 *man/knn_optim.Rd +dad5b7d9e21d91a5d08ff00de988d902 *man/knn_optim_parallel.Rd +e5e510d3cda1d310ee7607c39368f6ac *man/knn_optim_parallel2.Rd +3220a538d15044e6771cae6885352f44 *man/knn_optim_parallelf.Rd +d5b201bc2047d0ca9c0e1fd1c4326f28 *man/knn_past.Rd diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..9957a2a --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,10 @@ +# Generated by roxygen2: do not edit by hand + +export(knn_distances) +export(knn_next) +export(knn_optim) +export(knn_optim_parallel) +export(knn_optim_parallel2) +export(knn_optim_parallelf) +export(knn_past) +import(foreach) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..49bbb05 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,3 @@ +# knnp 1.0.0 +* Initial release + diff --git a/R/knn_distances.R b/R/knn_distances.R new file mode 100644 index 0000000..f9e0aa6 --- /dev/null +++ b/R/knn_distances.R @@ -0,0 +1,62 @@ +#' Distances matrixes computation and saving in files with a maximum of columns +#' +#' Calculates one distances matrix per each d for the given time series and then save them in files. Each file will +#' contain a maximum of 'cols' number of columns from the corresponding distances matrix. +#' +#' @param y A time series. +#' @param d Values of d's to be analyzed. +#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan, +#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method' +#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances. +#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}. +#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord". 
#' @param threads Number of threads to be used when parallelizing distances calculation, default is number of cores detected - 1 or
#' 1 if there is only one core or the number of cores cannot be detected.
#' @param file Path and id of the files where the distances matrixes will be saved.
#' @param cols Number of columns per file, it has to be at least 1.
#' @return Invisibly \code{NULL}; the function is called for its side effect of writing the files.
#' @examples
#' knn_distances(AirPassengers, 1:3, threads = 2, file = "AirPassengers", cols = 2)
#' knn_distances(LakeHuron, 1:6, threads = 2, file = "LakeHuron", cols = 10)
#' @export
knn_distances <- function(y, d, distance_metric = "euclidean", threads = NULL, file, cols = 1) {

  # A 'cols' below 1 would never advance the position inside the distances
  # vector, so the saving loop below would keep writing files forever.
  if (!is.numeric(cols) || length(cols) != 1 || is.na(cols) || cols < 1) {
    stop("'cols' must be a single number greater than or equal to 1", call. = FALSE)
  }

  # Default number of threads to be used; detectCores() may return NA when
  # the number of cores is unknown, in that case a single thread is used
  if (is.null(threads)) {
    cores <- parallel::detectCores()
    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
  }

  # Initialization of variables to be used
  y <- matrix(y, ncol = NCOL(y))
  n <- NROW(y)

  # Calculate one distances matrix for each d, as the distance varies
  # with the number of values that characterizes each 'element'. These
  # matrixes are saved in files.
  for (act_d in d) {
    # Get 'elements' matrix
    elements_matrix <- knn_elements(y, act_d)

    # Calculate distances between every 'element', a 'triangular matrix' is returned
    distances <- parallelDist::parDist(elements_matrix, distance_metric, threads = threads)

    # Save distances matrix in files that will contain a determined number of
    # columns. The first column has n - act_d values and every following
    # column has one value less than the previous one.
    i <- 1
    num_of_file <- 1
    distances_length <- length(distances)
    act_column_length <- n - act_d
    while (i <= distances_length) {
      initial_i <- i
      j <- 1
      while (j <= cols && i <= distances_length) {
        i <- i + act_column_length
        act_column_length <- act_column_length - 1
        j <- j + 1
      }
      saveRDS(distances[initial_i:(i - 1)], paste0(file, act_d, "_", num_of_file))
      num_of_file <- num_of_file + 1
    }
  }

  invisible(NULL)
}
# diff --git a/R/knn_elements.R b/R/knn_elements.R new file mode 100644 index 0000000..9997f2c --- /dev/null +++ b/R/knn_elements.R @@ -0,0 +1,27 @@
#' 'Elements' matrix computation
#'
#' Creates a matrix to be used for calculating distances. The most
#' recent 'element' is put in the first row of the matrix, the
#' second most recent 'element' in the second row and so on. Therefore,
#' the oldest 'element' is put in the last row.
#'
#' @param y A matrix.
#' @param d Length of each of the 'elements', between 1 and the number of rows of \code{y}.
#' @return A matrix to be used for calculating distances.
knn_elements <- function(y, d) {
  n <- NROW(y)
  m <- NCOL(y)

  # Out of range values of d used to fail with an unrelated subscript or
  # dimension error, report the real cause instead
  if (!is.numeric(d) || length(d) != 1 || is.na(d) || d < 1 || d > n) {
    stop("'d' must be a single number between 1 and the number of rows of 'y'", call. = FALSE)
  }

  last_elem <- n - d

  # Fill matrix as described above, it is done vertically for efficiency reasons.
  # Column 'col' holds, for variable i, the j-th value of every 'element';
  # rev() puts the most recent 'element' in the first row.
  elements_matrix <- matrix(nrow = last_elem + 1, ncol = d * m)
  col <- 1
  for (i in seq_len(m)) {
    for (j in seq_len(d)) {
      elements_matrix[, col] <- rev(y[(j:(j + last_elem)), i])
      col <- col + 1
    }
  }

  elements_matrix
}
# diff --git a/R/knn_next.R b/R/knn_next.R new file mode 100644 index 0000000..39079b0 --- /dev/null +++ b/R/knn_next.R @@ -0,0 +1,62 @@
#' Next value prediction
#'
#' Predicts next value of the time series using k-nearest neighbors algorithm.
+#' +#' @param y A time series. +#' @param k Number of neighbors. +#' @param d Length of each of the 'elements'. +#' @param v Variable to be predicted if given multivariate time series. +#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan, +#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method' +#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances. +#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}. +#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord". +#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean. +#' Three supported: proximity, same, linear. +#' \describe{ +#' \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance} +#' \item{same}{all neighbors are assigned with the same weight} +#' \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the +#' least nearest neighbor which is assigned with a weight of 1.} +#' } +#' @param threads Number of threads to be used when parallelizing distances calculation, default is number of cores detected - 1 or +#' 1 if there is only one core. +#' @return The predicted value. 
#' @examples
#' knn_next(AirPassengers, 5, 2, threads = 2)
#' knn_next(LakeHuron, 3, 6, threads = 2)
#' @export
knn_next <- function(y, k, d, v = 1, distance_metric = "euclidean", weight = "proximity", threads = NULL) {

  # Reject unsupported weights before any expensive work is done; otherwise
  # the failure only shows up later as a length mismatch in weighted.mean()
  supported_weights <- c("proximity", "same", "linear")
  if (!is.character(weight) || length(weight) != 1 || !(weight %in% supported_weights)) {
    stop("'weight' must be one of \"proximity\", \"same\" or \"linear\"", call. = FALSE)
  }

  # Default number of threads to be used; detectCores() may return NA when
  # the number of cores is unknown, in that case a single thread is used
  if (is.null(threads)) {
    cores <- parallel::detectCores()
    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
  }

  # Initialization of variables to be used
  y <- matrix(y, ncol = NCOL(y))
  n <- NROW(y)

  # Get 'elements' matrix
  elements_matrix <- knn_elements(y, d)

  # Calculate distances between every 'element', a 'triangular matrix' is returned.
  # Only the first column is used because it corresponds to the distances
  # between the most recent 'element' and the rest of the 'elements'.
  # seq_len() keeps the selection empty when there is no older 'element'.
  distances <- parallelDist::parDist(elements_matrix, distance_metric, threads = threads)[seq_len(n - d)]

  # Get the indexes of the k nearest 'elements', these are called neighbors
  k_nn <- utils::head((sort.int(distances, index.return = TRUE))$ix, k)

  # Calculate the weights for the future computation of the weighted mean.
  # They are sized by the neighbors actually found, which can be fewer than k,
  # so that values and weights always have the same length. The tiny offset
  # in 'proximity' avoids a division by zero for a neighbor at distance 0.
  n_found <- length(k_nn)
  weights <- switch(weight,
    proximity = 1 / (distances[k_nn] + .Machine$double.xmin * 1e150),
    same = rep.int(1, n_found),
    linear = rev(seq_len(n_found))
  )

  # Calculate the predicted value: the value that followed each neighbor,
  # averaged with the weights above
  stats::weighted.mean(y[n - k_nn + 1, v], weights)
}
# diff --git a/R/knn_optim.R b/R/knn_optim.R new file mode 100644 index 0000000..e038984 --- /dev/null +++ b/R/knn_optim.R @@ -0,0 +1,127 @@
#' k and d optimization
#'
#' Optimizes the values of k and d for a given time series. First, values corresponding to instants from init + 1 to the last one
#' are predicted.
The first value predicted, which corresponds to instant init + 1, is calculated using instants from 1 to +#' instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from 1 +#' to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series), +#' is predicted using instants from 1 to instant n - 1. Finally, the error is evaluated between the predicted values and +#' the real values of the series. +#' This version of the optimization function only uses one thread except for the distances matrixes calculation, for which the +#' number of threads to be used can be specified. +#' +#' @param y A time series. +#' @param k Values of k's to be analyzed. +#' @param d Values of d's to be analyzed. +#' @param v Variable to be predicted if given multivariate time series. +#' @param init Variable that determines the limit of the known past for the first instant predicted. +#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan, +#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method' +#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances. +#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}. +#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord". +#' @param error_metric Type of metric to evaluate the prediction error. +#' Five metrics supported: +#' \describe{ +#' \item{ME}{Mean Error} +#' \item{RMSE}{Root Mean Squared Error} +#' \item{MAE}{Mean Absolute Error} +#' \item{MPE}{Mean Percentage Error} +#' \item{MAPE}{Mean Absolute Percentage Error} +#' } +#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean. 
#' Three supported: proximity, same, linear.
#' \describe{
#' \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance}
#' \item{same}{all neighbors are assigned with the same weight}
#' \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the
#' least nearest neighbor which is assigned with a weight of 1.}
#' }
#' @param threads Number of threads to be used when parallelizing, default is number of cores detected - 1 or
#' 1 if there is only one core.
#' @return A matrix of errors, optimal k and d.
#' @examples
#' knn_optim(AirPassengers, 1:5, 1:3, threads = 2)
#' knn_optim(LakeHuron, 1:10, 1:6, threads = 2)
#' @export
knn_optim <- function(y, k, d, v = 1, init = NULL, distance_metric = "euclidean", error_metric = "MAE", weight = "proximity", threads = NULL) {

  # Choose the appropriate index of the accuracy result, depending on the
  # error_metric. Unsupported names are rejected here; otherwise the index
  # would be NULL and the failure would only appear when storing the errors.
  error_metrics <- c("ME", "RMSE", "MAE", "MPE", "MAPE")
  if (!is.character(error_metric) || length(error_metric) != 1 || !(error_metric %in% error_metrics)) {
    stop("'error_metric' must be one of \"ME\", \"RMSE\", \"MAE\", \"MPE\" or \"MAPE\"", call. = FALSE)
  }
  error_type <- match(error_metric, error_metrics)

  supported_weights <- c("proximity", "same", "linear")
  if (!is.character(weight) || length(weight) != 1 || !(weight %in% supported_weights)) {
    stop("'weight' must be one of \"proximity\", \"same\" or \"linear\"", call. = FALSE)
  }

  # Default number of threads to be used; detectCores() may return NA when
  # the number of cores is unknown, in that case a single thread is used
  if (is.null(threads)) {
    cores <- parallel::detectCores()
    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
  }

  # Sort k or d vector if they are unsorted
  if (is.unsorted(k)) {
    k <- sort(k)
  }
  if (is.unsorted(d)) {
    d <- sort(d)
  }

  # Initialization of variables to be used
  y <- matrix(y, ncol = NCOL(y))
  n <- NROW(y)
  ks <- length(k)
  ds <- length(d)
  if (is.null(init)) {
    init <- floor(n * 0.7)
  }

  # The first predicted instant needs at least one older 'element' (init > d)
  # and at least one instant has to be left to predict (init < n). Outside of
  # that range the descending ':' ranges below would pick wrong distances.
  if (!is.numeric(init) || length(init) != 1 || is.na(init) || init >= n || init <= max(d)) {
    stop("'init' must be a single number greater than every value of 'd' and lower than the length of the series", call. = FALSE)
  }

  real_values <- matrix(y[(init + 1):n, v])
  errors <- matrix(nrow = ks, ncol = ds, dimnames = list(k, d))

  for (i in seq_len(ds)) {
    predictions <- matrix(nrow = ks, ncol = n - init)

    # Get 'elements' matrix
    elements_matrix <- knn_elements(y, d[i])

    # Calculate distances between every 'element', a 'triangular matrix' is returned
    distances_matrix <- parallelDist::parDist(elements_matrix, distance_metric, threads = threads)
    distances_matrix_size <- attr(distances_matrix, "Size")

    # j runs downwards so that predictions are produced in chronological
    # order, from instant init + 1 (largest j) to instant n (j = 2)
    for (j in (n - init + 1):2) {
      # Get column needed from the distances matrix and sort it
      initial_index <- distances_matrix_size * (j - 1) - j * (j - 1) / 2 + 1
      distances_col <- distances_matrix[initial_index:(initial_index + n - d[i] - j)]
      sorted_distances_col <- sort.int(distances_col, index.return = TRUE)

      for (k_index in seq_len(ks)) {
        # Get the indexes of the k nearest 'elements', these are called neighbors
        k_nn <- utils::head(sorted_distances_col$ix, k[k_index])

        # Calculate the weights for the future computation of the weighted mean.
        # They are sized by the neighbors actually found, which can be fewer
        # than k, so that values and weights always have the same length.
        n_found <- length(k_nn)
        weights <- switch(weight,
          proximity = 1 / (distances_col[k_nn] + .Machine$double.xmin * 1e150),
          same = rep.int(1, n_found),
          linear = rev(seq_len(n_found))
        )

        # Calculate the predicted value
        predictions[k_index, n - init + 2 - j] <- stats::weighted.mean(y[n - j + 2 - k_nn, v], weights)
      }
    }

    # Calculate error values between the known values and the predicted values, these values
    # correspond to instants init + 1 to n. This is done for the current d and all k's
    for (k_index in seq_len(ks)) {
      errors[k_index, i] <- forecast::accuracy(stats::ts(predictions[k_index, ]), real_values)[error_type]
    }
  }

  # Construction of the list to be returned; the linear index of the minimum
  # is turned back into its row (k) and column (d) position
  index_min_error <- which.min(errors)
  opt_k <- k[((index_min_error - 1) %% ks) + 1]
  opt_d <- d[ceiling(index_min_error / ks)]

  list(errors = errors, k = opt_k, d = opt_d)
}
# diff --git a/R/knn_optim_parallel.R b/R/knn_optim_parallel.R new file mode 100644 index 0000000..aea6e06 --- /dev/null +++ b/R/knn_optim_parallel.R @@ -0,0 +1,152 @@
#' Parallel k and d optimization
#'
#' Optimizes the values of K and D for a given time series. First, values corresponding to instants from init + 1 to the last one
#' are predicted.
The first value predicted, which corresponds to instant init + 1, is calculated using instants from 1 to +#' instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from 1 +#' to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series), +#' is predicted using instants from 1 to instant n - 1. Finally, the error is evaluated between the predicted values and +#' the real values of the series. +#' This version of the optimization function uses a parallelized distances calculation function, and the computation of +#' the predicted values is done parallelizing by the number of d's and the number of instants to be predicted. +#' +#' @param y A time series. +#' @param k Values of k's to be analyzed. +#' @param d Values of d's to be analyzed. +#' @param v Variable to be predicted if given multivariate time series. +#' @param init Variable that determines the limit of the known past for the first instant predicted. +#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan, +#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method' +#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances. +#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}. +#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord". +#' @param error_metric Type of metric to evaluate the prediction error. 
#' Five metrics supported:
#' \describe{
#' \item{ME}{Mean Error}
#' \item{RMSE}{Root Mean Squared Error}
#' \item{MAE}{Mean Absolute Error}
#' \item{MPE}{Mean Percentage Error}
#' \item{MAPE}{Mean Absolute Percentage Error}
#' }
#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean.
#' Three supported: proximity, same, linear.
#' \describe{
#' \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance}
#' \item{same}{all neighbors are assigned with the same weight}
#' \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the
#' least nearest neighbor which is assigned with a weight of 1.}
#' }
#' @param threads Number of threads to be used when parallelizing, default is number of cores detected - 1 or
#' 1 if there is only one core.
#' @return A matrix of errors, optimal k and d.
#' @examples
#' knn_optim_parallel(AirPassengers, 1:5, 1:3, threads = 2)
#' knn_optim_parallel(LakeHuron, 1:10, 1:6, threads = 2)
#' @import foreach
#' @export
knn_optim_parallel <- function(y, k, d, v = 1, init = NULL, distance_metric = "euclidean", error_metric = "MAE", weight = "proximity", threads = NULL) {

  # Choose the appropriate index of the accuracy result, depending on the
  # error_metric. Unsupported names are rejected here; otherwise the index
  # would be NULL and the failure would only appear when storing the errors.
  error_metrics <- c("ME", "RMSE", "MAE", "MPE", "MAPE")
  if (!is.character(error_metric) || length(error_metric) != 1 || !(error_metric %in% error_metrics)) {
    stop("'error_metric' must be one of \"ME\", \"RMSE\", \"MAE\", \"MPE\" or \"MAPE\"", call. = FALSE)
  }
  error_type <- match(error_metric, error_metrics)

  supported_weights <- c("proximity", "same", "linear")
  if (!is.character(weight) || length(weight) != 1 || !(weight %in% supported_weights)) {
    stop("'weight' must be one of \"proximity\", \"same\" or \"linear\"", call. = FALSE)
  }

  # Default number of threads to be used; detectCores() may return NA when
  # the number of cores is unknown, in that case a single thread is used
  if (is.null(threads)) {
    cores <- parallel::detectCores()
    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
  }

  # Sort k or d vector if they are unsorted
  if (is.unsorted(k)) {
    k <- sort(k)
  }
  if (is.unsorted(d)) {
    d <- sort(d)
  }

  # Initialization of variables to be used
  y <- matrix(y, ncol = NCOL(y))
  n <- NROW(y)
  ks <- length(k)
  ds <- length(d)
  if (is.null(init)) {
    init <- floor(n * 0.7)
  }

  # The first predicted instant needs at least one older 'element' (init > d)
  # and at least one instant has to be left to predict (init < n). Outside of
  # that range the descending ':' ranges below would pick wrong distances.
  if (!is.numeric(init) || length(init) != 1 || is.na(init) || init >= n || init <= max(d)) {
    stop("'init' must be a single number greater than every value of 'd' and lower than the length of the series", call. = FALSE)
  }

  real_values <- matrix(y[(init + 1):n, v])
  errors <- matrix(nrow = ks, ncol = ds, dimnames = list(k, d))
  distances_matrixes <- vector("list", ds)
  distances_matrixes_sizes <- vector(mode = "numeric", ds)

  # This next line is only there to avoid 'No visible binding for global variable' warning
  # in R CMD check due to j variable used in foreach loop
  j <- NULL

  # Calculate one distances matrix for each d, as the distances vary
  # with the number of values that characterize each 'element'.
  for (i in seq_len(ds)) {
    # Get 'elements' matrix
    elements_matrix <- knn_elements(y, d[i])

    # Calculate distances between every 'element', a 'triangular matrix' is returned
    distances_matrix <- parallelDist::parDist(elements_matrix, distance_metric, threads = threads)
    distances_matrixes[[i]] <- distances_matrix
    distances_matrixes_sizes[i] <- attr(distances_matrix, "Size")
  }

  # For each of the combinations of d's and instants init to n - 1, a distances vector
  # according to each combination is taken from the corresponding distances matrix and then
  # ordered. Later, the k's inner loop applies k-nn to predict values.

  # The cleanup is registered right after the cluster is created so that the
  # workers are stopped and the sequential backend restored even if a task fails
  clust <- parallel::makeCluster(threads)
  on.exit({
    foreach::registerDoSEQ()
    parallel::stopCluster(clust)
  }, add = TRUE)
  doParallel::registerDoParallel(cl = clust)

  all_predictions <- foreach::foreach(i = seq_len(ds), .combine = cbind) %:%
    foreach::foreach(j = (n - init + 1):2, .combine = cbind) %dopar% {
      predictions <- vector(mode = "numeric", ks)

      # Get column needed from the distances matrix and sort it
      initial_index <- distances_matrixes_sizes[i] * (j - 1) - j * (j - 1) / 2 + 1
      distances_col <- distances_matrixes[[i]][initial_index:(initial_index + n - d[i] - j)]
      sorted_distances_col <- sort.int(distances_col, index.return = TRUE)

      for (k_index in seq_len(ks)) {
        # Get the indexes of the k nearest 'elements', these are called neighbors
        k_nn <- utils::head(sorted_distances_col$ix, k[k_index])

        # Calculate the weights for the future computation of the weighted mean.
        # They are sized by the neighbors actually found, which can be fewer
        # than k, so that values and weights always have the same length.
        n_found <- length(k_nn)
        weights <- switch(weight,
          proximity = 1 / (distances_col[k_nn] + .Machine$double.xmin * 1e150),
          same = rep.int(1, n_found),
          linear = rev(seq_len(n_found))
        )

        # Calculate the predicted value
        predictions[k_index] <- stats::weighted.mean(y[n - j + 2 - k_nn, v], weights)
      }

      predictions
    }

  # When only one task was run nothing gets combined and a plain vector comes
  # back; force the 'one row per k' layout the indexing below relies on
  all_predictions <- matrix(all_predictions, nrow = ks)

  # Calculate error values between the known values and the predicted values, these values
  # correspond to instants init + 1 to n. This is done for all k's and d's analyzed
  for (i in seq_len(ds)) {
    initial_index <- (i - 1) * (n - init) + 1
    d_columns <- initial_index:(initial_index + n - init - 1)
    for (k_index in seq_len(ks)) {
      errors[k_index, i] <- forecast::accuracy(stats::ts(all_predictions[k_index, d_columns]), real_values)[error_type]
    }
  }

  # Construction of the list to be returned; the linear index of the minimum
  # is turned back into its row (k) and column (d) position
  index_min_error <- which.min(errors)
  opt_k <- k[((index_min_error - 1) %% ks) + 1]
  opt_d <- d[ceiling(index_min_error / ks)]

  list(errors = errors, k = opt_k, d = opt_d)
}
# diff --git a/R/knn_optim_parallel2.R b/R/knn_optim_parallel2.R new file mode 100644 index 0000000..72cd99c --- /dev/null +++ b/R/knn_optim_parallel2.R @@ -0,0 +1,145 @@
#' Parallel k and d optimization
#'
#' Optimizes the values of k and d for a given time series. First, values corresponding to instants from init + 1 to the last one
#' are predicted. The first value predicted, which corresponds to instant init + 1, is calculated using instants from 1 to
#' instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from 1
#' to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series),
#' is predicted using instants from 1 to instant n - 1. Finally, the error is evaluated between the predicted values and
#' the real values of the series.
#' This version of the optimization function uses a parallelized distances calculation function, and the computation of
#' the predicted values is done parallelizing by the number of d's.
#'
#' @param y A time series.
#' @param k Values of k's to be analyzed.
#' @param d Values of d's to be analyzed.
#' @param v Variable to be predicted if given multivariate time series.
#' @param init Variable that determines the limit of the known past for the first instant predicted.
#' @param distance_metric Type of metric to evaluate the distance between points.
Many metrics are supported: euclidean, manhattan, +#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method' +#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances. +#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}. +#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord". +#' @param error_metric Type of metric to evaluate the prediction error. +#' Five metrics supported: +#' \describe{ +#' \item{ME}{Mean Error} +#' \item{RMSE}{Root Mean Squared Error} +#' \item{MAE}{Mean Absolute Error} +#' \item{MPE}{Mean Percentage Error} +#' \item{MAPE}{Mean Absolute Percentage Error} +#' } +#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean. +#' Three supported: proximity, same, linear. +#' \describe{ +#' \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance} +#' \item{same}{all neighbors are assigned with the same weight} +#' \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the +#' least nearest neighbor which is assigned with a weight of 1.} +#' } +#' @param threads Number of threads to be used when parallelizing, default is number of cores detected - 1 or +#' 1 if there is only one core. +#' @return A matrix of errors, optimal k and d. 
#' @examples
#' knn_optim_parallel2(AirPassengers, 1:5, 1:3, threads = 2)
#' knn_optim_parallel2(LakeHuron, 1:10, 1:6, threads = 2)
#' @import foreach
#' @export
knn_optim_parallel2 <- function(y, k, d, v = 1, init = NULL, distance_metric = "euclidean", error_metric = "MAE", weight = "proximity", threads = NULL) {

  # Choose the appropriate index of the accuracy result, depending on the
  # error_metric. Unsupported names are rejected here; otherwise the index
  # would be NULL and the failure would only appear inside the workers.
  error_metrics <- c("ME", "RMSE", "MAE", "MPE", "MAPE")
  if (!is.character(error_metric) || length(error_metric) != 1 || !(error_metric %in% error_metrics)) {
    stop("'error_metric' must be one of \"ME\", \"RMSE\", \"MAE\", \"MPE\" or \"MAPE\"", call. = FALSE)
  }
  error_type <- match(error_metric, error_metrics)

  supported_weights <- c("proximity", "same", "linear")
  if (!is.character(weight) || length(weight) != 1 || !(weight %in% supported_weights)) {
    stop("'weight' must be one of \"proximity\", \"same\" or \"linear\"", call. = FALSE)
  }

  # Default number of threads to be used; detectCores() may return NA when
  # the number of cores is unknown, in that case a single thread is used
  if (is.null(threads)) {
    cores <- parallel::detectCores()
    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
  }

  # Sort k or d vector if they are unsorted
  if (is.unsorted(k)) {
    k <- sort(k)
  }
  if (is.unsorted(d)) {
    d <- sort(d)
  }

  # Initialization of variables to be used
  y <- matrix(y, ncol = NCOL(y))
  n <- NROW(y)
  ks <- length(k)
  ds <- length(d)
  if (is.null(init)) {
    init <- floor(n * 0.7)
  }

  # The first predicted instant needs at least one older 'element' (init > d)
  # and at least one instant has to be left to predict (init < n). Outside of
  # that range the descending ':' ranges below would pick wrong distances.
  if (!is.numeric(init) || length(init) != 1 || is.na(init) || init >= n || init <= max(d)) {
    stop("'init' must be a single number greater than every value of 'd' and lower than the length of the series", call. = FALSE)
  }

  real_values <- matrix(y[(init + 1):n, v])

  # This next line is only there to avoid 'No visible binding for global variable' warning
  # in R CMD check due to i variable used in foreach loop
  i <- NULL

  # For each d an 'elements' matrix and a distances matrix is calculated. Then, with the two inner loops
  # all combinations of instants init to n - 1 and k's values are generated in order to predict values
  # using k-nn algorithm and calculate errors.

  # The cleanup is registered right after the cluster is created so that the
  # workers are stopped and the sequential backend restored even if a task fails
  clust <- parallel::makeCluster(threads)
  on.exit({
    foreach::registerDoSEQ()
    parallel::stopCluster(clust)
  }, add = TRUE)
  doParallel::registerDoParallel(cl = clust)

  errors_matrix <- foreach::foreach(i = seq_len(ds), .combine = cbind, .packages = c("forecast", "parallelDist"), .export = "knn_elements") %dopar% {
    predictions <- matrix(nrow = ks, ncol = n - init)
    errors <- vector(mode = "numeric", ks)

    # Get 'elements' matrix
    elements_matrix <- knn_elements(y, d[i])

    # Calculate distances between every 'element', a 'triangular matrix' is returned
    distances_matrix <- parallelDist::parDist(elements_matrix, distance_metric, threads = threads)
    distances_matrix_size <- attr(distances_matrix, "Size")

    # j runs downwards so that predictions are produced in chronological
    # order, from instant init + 1 (largest j) to instant n (j = 2)
    for (j in (n - init + 1):2) {
      # Get column needed from the distances matrix and sort it
      initial_index <- distances_matrix_size * (j - 1) - j * (j - 1) / 2 + 1
      distances_col <- distances_matrix[initial_index:(initial_index + n - d[i] - j)]
      sorted_distances_col <- sort.int(distances_col, index.return = TRUE)

      for (k_index in seq_len(ks)) {
        # Get the indexes of the k nearest 'elements', these are called neighbors
        k_nn <- utils::head(sorted_distances_col$ix, k[k_index])

        # Calculate the weights for the future computation of the weighted mean.
        # They are sized by the neighbors actually found, which can be fewer
        # than k, so that values and weights always have the same length.
        n_found <- length(k_nn)
        weights <- switch(weight,
          proximity = 1 / (distances_col[k_nn] + .Machine$double.xmin * 1e150),
          same = rep.int(1, n_found),
          linear = rev(seq_len(n_found))
        )

        # Calculate the predicted value
        predictions[k_index, n - init + 2 - j] <- stats::weighted.mean(y[n - j + 2 - k_nn, v], weights)
      }
    }

    # Calculate error values between the known values and the predicted values, these values
    # correspond to instants init + 1 to n. This is done for the current d and all k's
    for (k_index in seq_len(ks)) {
      errors[k_index] <- forecast::accuracy(stats::ts(predictions[k_index, ]), real_values)[error_type]
    }

    errors
  }

  # With a single d nothing gets combined and a plain vector comes back, on
  # which setting dimnames fails; always shape the result as a ks x ds matrix
  errors_matrix <- matrix(errors_matrix, nrow = ks, ncol = ds, dimnames = list(k, d))

  # Construction of the list to be returned; the linear index of the minimum
  # is turned back into its row (k) and column (d) position
  index_min_error <- which.min(errors_matrix)
  opt_k <- k[((index_min_error - 1) %% ks) + 1]
  opt_d <- d[ceiling(index_min_error / ks)]

  list(errors = errors_matrix, k = opt_k, d = opt_d)
}
# diff --git a/R/knn_optim_parallelf.R b/R/knn_optim_parallelf.R new file mode 100644 index 0000000..ef89b63 --- /dev/null +++ b/R/knn_optim_parallelf.R @@ -0,0 +1,168 @@
#' Parallel k and d optimization reading from files
#'
#' Optimizes the values of k and d for a given time series. First, values corresponding to instants from init + 1 to the last one
#' are predicted. The first value predicted, which corresponds to instant init + 1, is calculated using instants from 1 to
#' instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from 1
#' to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series),
#' is predicted using instants from 1 to instant n - 1. Finally, the error is evaluated between the predicted values and
#' the real values of the series.
#' This version of the optimization function uses a parallelized distances calculation function, and the computation of
#' the predicted values is done parallelizing by the number of d's and the number of instants to be predicted. Each thread
#' that calculates predicted values reads only the part of the corresponding distances matrix in which the information used
#' to predict is contained.
#'
#' @param y A time series.
#' @param k Values of k's to be analyzed.
#' @param d Values of d's to be analyzed.
+#' @param v Variable to be predicted if given multivariate time series. +#' @param init Variable that determines the limit of the known past for the first instant predicted. +#' @param error_metric Type of metric to evaluate the prediction error. +#' Five metrics supported: +#' \describe{ +#' \item{ME}{Mean Error} +#' \item{RMSE}{Root Mean Squared Error} +#' \item{MAE}{Mean Absolute Error} +#' \item{MPE}{Mean Percentage Error} +#' \item{MAPE}{Mean Absolute Percentage Error} +#' } +#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean. +#' Three supported: proximity, same, linear. +#' \describe{ +#' \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance} +#' \item{same}{all neighbors are assigned with the same weight} +#' \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the +#' least nearest neighbor which is assigned with a weight of 1.} +#' } +#' @param threads Number of threads to be used when parallelizing, default is number of cores detected - 1 or +#' 1 if there is only one core. +#' @param file Path and id of the files where the distances matrixes are. +#' @param cols Number of columns per file. +#' @return A matrix of errors, optimal k and d. 
+#' @examples
+#' knn_distances(AirPassengers, 1:3, file = "AirPassengers", cols = 2, threads = 2)
+#' knn_optim_parallelf(AirPassengers, 1:5, 1:3, file = "AirPassengers", cols = 2, threads = 2)
+#' knn_distances(LakeHuron, 1:6, file = "LakeHuron", cols = 10, threads = 2)
+#' knn_optim_parallelf(LakeHuron, 1:10, 1:6, file = "LakeHuron", cols = 10, threads = 2)
+#' @import foreach
+#' @export
+knn_optim_parallelf <- function(y, k, d, v = 1, init = NULL, error_metric = "MAE", weight = "proximity", threads = NULL, file, cols){
+
+  # Validate the enumerated arguments up front. An unknown value used to make
+  # switch() return NULL and only failed much later (inside a worker or when
+  # filling the errors matrix) with an unrelated-looking message.
+  error_metrics <- c("ME", "RMSE", "MAE", "MPE", "MAPE")
+  error_metric <- match.arg(error_metric, error_metrics)
+  weight <- match.arg(weight, c("proximity", "same", "linear"))
+
+  # Index of the chosen metric in the result of forecast::accuracy
+  error_type <- match(error_metric, error_metrics)
+
+  # Default number of threads to be used: every detected core but one, and
+  # never fewer than one. detectCores() may return NA.
+  if (is.null(threads)) {
+    cores <- parallel::detectCores()
+    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
+  }
+
+  # Sort k or d vector if they are unsorted
+  if (is.unsorted(k)) {
+    k <- sort(k)
+  }
+  if (is.unsorted(d)) {
+    d <- sort(d)
+  }
+
+  # Initialization of variables to be used
+  y <- matrix(y, ncol = NCOL(y))
+  n <- NROW(y)
+  ks <- length(k)
+  ds <- length(d)
+  if (is.null(init)) {
+    init <- floor(n * 0.7)
+  }
+  # With init >= n there is nothing to predict and the descending sequences
+  # below (e.g. 2:1) would silently index the wrong columns.
+  if (!is.numeric(init) || length(init) != 1 || init < 1 || init >= n) {
+    stop("'init' must be a single number between 1 and the length of the series minus 1", call. = FALSE)
+  }
+  # Number of predicted instants (init + 1 to n) and number of distances
+  # columns involved (one more, because the first column is never used here)
+  n_pred <- n - init
+  total_cols <- n_pred + 1
+  real_values <- matrix(rev(y[(init + 1):n, v]))
+  errors <- matrix(nrow = ks, ncol = ds, dimnames = list(k, d))
+
+  # Tiny positive term added to the distances so that a distance of exactly
+  # zero does not produce an infinite 'proximity' weight
+  zero_guard <- .Machine$double.xmin * 1e150
+
+  # This next line is only there to avoid 'No visible binding for global variable' warning
+  # in R CMD check due to num_of_file variable used in foreach loop
+  num_of_file <- NULL
+
+  # For each of the combinations of d's and instants init to n - 1, a distances vector
+  # according to each combination is taken from a file and then ordered.
+  # Later, the k's inner loop applies k-nn to predict values.
+
+  clust <- parallel::makeCluster(threads)
+  # Release the workers and restore the sequential backend even if reading a
+  # file or computing a prediction fails.
+  on.exit({
+    foreach::registerDoSEQ()
+    parallel::stopCluster(clust)
+  }, add = TRUE)
+  doParallel::registerDoParallel(cl = clust)
+
+  # ids of files to be open. With one column per file the first file only holds
+  # the first column, which is skipped, so it is not opened at all.
+  first_file <- if (cols == 1) 2 else 1
+  num_of_file_array <- first_file:ceiling(total_cols / cols)
+
+  all_predictions <- foreach::foreach(i = seq_len(ds), .combine = cbind) %:%
+    foreach::foreach(num_of_file = num_of_file_array, .combine = cbind) %dopar% {
+    # The last file may hold fewer than 'cols' columns
+    num_cols_in_file <- if (num_of_file * cols > total_cols) total_cols %% cols else cols
+
+    # The first column of the first file is skipped
+    first_col <- if (num_of_file == 1 && cols > 1) 2 else 1
+    j_in_file_array <- first_col:num_cols_in_file
+    predictions <- matrix(nrow = ks, ncol = length(j_in_file_array))
+
+    # Get distances matrix
+    distances_matrix_size <- n - d[i] - cols * (num_of_file - 1) + 1
+    distances_matrix <- readRDS(paste0(file, d[i], "_", num_of_file))
+
+    for (j_in_file in j_in_file_array) {
+      # Get column and sort it
+      initial_index <- distances_matrix_size * (j_in_file - 1) - j_in_file * (j_in_file - 1) / 2 + 1
+      distances_col <- distances_matrix[initial_index:(initial_index + distances_matrix_size - j_in_file - 1)]
+      sorted_distances_col <- sort.int(distances_col, index.return = TRUE)
+
+      # Row of y from which the neighbors' indexes are subtracted to reach
+      # the value that followed each neighbor
+      base_row <- n - (num_of_file - 1) * cols - j_in_file + 2
+      prediction_col <- j_in_file - first_col + 1
+
+      for (k_index in seq_len(ks)) {
+        k_value <- k[k_index]
+
+        # Get the indexes of the k nearest 'elements', these are called neighbors
+        k_nn <- utils::head(sorted_distances_col$ix, k_value)
+
+        # There may be fewer than k candidates; size the weights by the
+        # neighbors actually found so every scheme matches length(k_nn)
+        neighbors <- length(k_nn)
+
+        # Calculate the weights for the future computation of the weighted mean
+        weights <- switch(weight,
+                          proximity = 1 / (distances_col[k_nn] + zero_guard),
+                          same = rep.int(1, neighbors),
+                          linear = neighbors:1)
+
+        # Calculate the predicted value
+        predictions[k_index, prediction_col] <- stats::weighted.mean(y[base_row - k_nn, v], weights)
+      }
+    }
+
+    predictions
+  }
+
+  # Calculate error values between the known values and the predicted values, these values
+  # correspond to instants init to n - 1. This is done for all k's and d's analyzed
+  for (i in seq_len(ds)) {
+    pred_cols <- (i - 1) * n_pred + seq_len(n_pred)
+    for (k_index in seq_len(ks)) {
+      errors[k_index, i] <- forecast::accuracy(stats::ts(all_predictions[k_index, pred_cols]), real_values)[error_type]
+    }
+  }
+
+  # Construction of the list to be returned. ME and MPE are signed, so the best
+  # combination is the one closest to zero rather than the most negative one;
+  # for the non-negative metrics this is the plain minimum.
+  index_min_error <- which.min(abs(errors))
+  opt_k <- k[((index_min_error - 1) %% ks) + 1]
+  opt_d <- d[ceiling(index_min_error / ks)]
+  result <- list(errors = errors, k = opt_k, d = opt_d)
+
+  result
+}
diff --git a/R/knn_past.R b/R/knn_past.R
new file mode 100644
index 0000000..986464a
--- /dev/null
+++ b/R/knn_past.R
@@ -0,0 +1,76 @@
+#' Past time prediction
+#'
+#' Predicts values of the time series using k-nearest neighbors algorithm. Values corresponding to instants from init + 1 to
+#' the last one are predicted. The first value predicted, which corresponds to instant init + 1, is calculated using instants
+#' from 1 to instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from
+#' 1 to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series),
+#' is predicted using instants from 1 to instant n - 1.
+#'
+#' @param y A time series.
+#' @param k Number of neighbors.
+#' @param d Length of each of the 'elements'.
+#' @param v Variable to be predicted if given multivariate time series.
+#' @param init Variable that determines the limit of the known past for the first instant predicted.
+#' @param distance_metric Type of metric to evaluate the distance between points.
Many metrics are supported: euclidean, manhattan, +#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method' +#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances. +#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}. +#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord". +#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean. +#' Three supported: proximity, same, linear. +#' \describe{ +#' \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance, so closer neighbors weigh more} +#' \item{same}{all neighbors are assigned with the same weight} +#' \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the +#' least nearest neighbor which is assigned with a weight of 1.} +#' } +#' @param threads Number of threads to be used when parallelizing, default is number of cores detected - 1 or +#' 1 if there is only one core. +#' @return The predicted values.
+#' @examples
+#' knn_past(AirPassengers, 5, 2, threads = 2)
+#' knn_past(LakeHuron, 3, 6, threads = 2)
+#' @export
+knn_past <- function(y, k, d, v = 1, init = NULL, distance_metric = "euclidean", weight = "proximity", threads = NULL) {
+
+  # Reject an unsupported weighting scheme up front; otherwise switch() below
+  # returns NULL and weighted.mean fails with a length-mismatch message.
+  weight <- match.arg(weight, c("proximity", "same", "linear"))
+
+  # Default number of threads to be used: every detected core but one, and
+  # never fewer than one. detectCores() may return NA.
+  if (is.null(threads)) {
+    cores <- parallel::detectCores()
+    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
+  }
+
+  # Initialization of variables to be used
+  y <- matrix(y, ncol = NCOL(y))
+  n <- NROW(y)
+  if (is.null(init)) {
+    init <- floor(n * 0.7)
+  }
+  # With init >= n there is nothing to predict and the loop below would run
+  # over the descending sequence 2:1.
+  if (!is.numeric(init) || length(init) != 1 || init < 1 || init >= n) {
+    stop("'init' must be a single number between 1 and the length of the series minus 1", call. = FALSE)
+  }
+  predictions <- array(dim = n - init)
+
+  # Tiny positive term added to the distances so that a distance of exactly
+  # zero does not produce an infinite 'proximity' weight
+  zero_guard <- .Machine$double.xmin * 1e150
+
+  # Get 'elements' matrix
+  elements_matrix <- knn_elements(y, d)
+
+  # Calculate distances between every 'element', a 'triangular matrix' is returned
+  distances_matrix <- parallelDist::parDist(elements_matrix, distance_metric, threads = threads)
+  distances_size <- attr(distances_matrix, "Size")
+
+  # Column j = 2 predicts the last instant (n), so the result is filled from
+  # its last position backwards.
+  prediction_index <- length(predictions)
+  for (j in 2:(n - init + 1)) {
+    # Get column needed from the distances matrix
+    initial_index <- distances_size * (j - 1) - j * (j - 1) / 2 + 1
+    distances_col <- distances_matrix[initial_index:(initial_index + n - d - j)]
+
+    # Get the indexes of the k nearest 'elements', these are called neighbors
+    k_nn <- utils::head(sort.int(distances_col, index.return = TRUE)$ix, k)
+
+    # There may be fewer than k candidates; size the weights by the
+    # neighbors actually found so every scheme matches length(k_nn)
+    neighbors <- length(k_nn)
+
+    # Calculate the weights for the future computation of the weighted mean
+    weights <- switch(weight,
+                      proximity = 1 / (distances_col[k_nn] + zero_guard),
+                      same = rep.int(1, neighbors),
+                      linear = neighbors:1)
+
+    # Calculate the predicted value
+    predictions[prediction_index] <- stats::weighted.mean(y[n - j + 2 - k_nn, v], weights)
+    prediction_index <- prediction_index - 1
+  }
+
+  predictions
+}
diff --git a/man/knn_distances.Rd b/man/knn_distances.Rd
new file mode 100644
index 0000000..c602ad0
--- /dev/null
+++ b/man/knn_distances.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/knn_distances.R
+\name{knn_distances}
+\alias{knn_distances} +\title{Distances matrixes computation and saving in files with a maximum of columns} +\usage{ +knn_distances(y, d, distance_metric = "euclidean", threads = NULL, file, + cols = 1) +} +\arguments{ +\item{y}{A time series.} + +\item{d}{Values of d's to be analyzed.} + +\item{distance_metric}{Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan, +dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method' +argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances. +Link to the package info: \url{https://cran.r-project.org/package=parallelDist}. +Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".} + +\item{threads}{Number of threads to be used when parallelizing distances calculation, default is number of cores detected - 1 or +1 if there is only one core.} + +\item{file}{Path and id of the files where the distances matrixes will be saved.} + +\item{cols}{Number of columns per file.} +} +\description{ +Calculates one distances matrix per each d for the given time series and then save them in files. Each file will +contain a maximum of 'cols' number of columns from the corresponding distances matrix. 
+} +\examples{ +knn_distances(AirPassengers, 1:3, threads = 2, file = "AirPassengers", cols = 2) +knn_distances(LakeHuron, 1:6, threads = 2, file = "LakeHuron", cols = 10) +} diff --git a/man/knn_elements.Rd b/man/knn_elements.Rd new file mode 100644 index 0000000..e14ce97 --- /dev/null +++ b/man/knn_elements.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/knn_elements.R +\name{knn_elements} +\alias{knn_elements} +\title{'Elements' matrix computation} +\usage{ +knn_elements(y, d) +} +\arguments{ +\item{y}{A matrix.} + +\item{d}{Length of each of the 'elements'.} +} +\value{ +A matrix to be used for calculating distances. +} +\description{ +Creates a matrix to be used for calculating distances. The most +recent 'element' is put in the first row of the matrix, the +second most recent 'element' in the second row and so on. Therefore, +the oldest 'element' is put in the last row. +} diff --git a/man/knn_next.Rd b/man/knn_next.Rd new file mode 100644 index 0000000..d7674cb --- /dev/null +++ b/man/knn_next.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/knn_next.R +\name{knn_next} +\alias{knn_next} +\title{Next value prediction} +\usage{ +knn_next(y, k, d, v = 1, distance_metric = "euclidean", + weight = "proximity", threads = NULL) +} +\arguments{ +\item{y}{A time series.} + +\item{k}{Number of neighbors.} + +\item{d}{Length of each of the 'elements'.} + +\item{v}{Variable to be predicted if given multivariate time series.} + +\item{distance_metric}{Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan, +dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method' +argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances. 
+Link to the package info: \url{https://cran.r-project.org/package=parallelDist}. +Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".} + +\item{weight}{Type of weight to be used at the time of calculating the predicted value with a weighted mean. +Three supported: proximity, same, linear. +\describe{ + \item{proximity}{the weight assigned to each neighbor is proportional to its distance} + \item{same}{all neighbors are assigned with the same weight} + \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the + least nearest neighbor which is assigned with a weight of 1.} +}} + +\item{threads}{Number of threads to be used when parallelizing distances calculation, default is number of cores detected - 1 or +1 if there is only one core.} +} +\value{ +The predicted value. +} +\description{ +Predicts next value of the time series using k-nearest neighbors algorithm. +} +\examples{ +knn_next(AirPassengers, 5, 2, threads = 2) +knn_next(LakeHuron, 3, 6, threads = 2) +} diff --git a/man/knn_optim.Rd b/man/knn_optim.Rd new file mode 100644 index 0000000..6e65040 --- /dev/null +++ b/man/knn_optim.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/knn_optim.R +\name{knn_optim} +\alias{knn_optim} +\title{k and d optimization} +\usage{ +knn_optim(y, k, d, v = 1, init = NULL, distance_metric = "euclidean", + error_metric = "MAE", weight = "proximity", threads = NULL) +} +\arguments{ +\item{y}{A time series.} + +\item{k}{Values of k's to be analyzed.} + +\item{d}{Values of d's to be analyzed.} + +\item{v}{Variable to be predicted if given multivariate time series.} + +\item{init}{Variable that determines the limit of the known past for the first instant predicted.} + +\item{distance_metric}{Type of metric to evaluate the distance between points. 
Many metrics are supported: euclidean, manhattan, +dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method' +argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances. +Link to the package info: \url{https://cran.r-project.org/package=parallelDist}. +Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".} + +\item{error_metric}{Type of metric to evaluate the prediction error. +Five metrics supported: +\describe{ + \item{ME}{Mean Error} + \item{RMSE}{Root Mean Squared Error} + \item{MAE}{Mean Absolute Error} + \item{MPE}{Mean Percentage Error} + \item{MAPE}{Mean Absolute Percentage Error} +}} + +\item{weight}{Type of weight to be used at the time of calculating the predicted value with a weighted mean. +Three supported: proximity, same, linear. +\describe{ + \item{proximity}{the weight assigned to each neighbor is proportional to its distance} + \item{same}{all neighbors are assigned with the same weight} + \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the + least nearest neighbor which is assigned with a weight of 1.} +}} + +\item{threads}{Number of threads to be used when parallelizing, default is number of cores detected - 1 or +1 if there is only one core.} +} +\value{ +A matrix of errors, optimal k and d. +} +\description{ +Optimizes the values of k and d for a given time series. First, values corresponding to instants from init + 1 to the last one +are predicted. 
The first value predicted, which corresponds to instant init + 1, is calculated using instants from 1 to +instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from 1 +to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series), +is predicted using instants from 1 to instant n - 1. Finally, the error is evaluated between the predicted values and +the real values of the series. +This version of the optimization function only uses one thread except for the distances matrixes calculation, for which the +number of threads to be used can be specified. +} +\examples{ +knn_optim(AirPassengers, 1:5, 1:3, threads = 2) +knn_optim(LakeHuron, 1:10, 1:6, threads = 2) +} diff --git a/man/knn_optim_parallel.Rd b/man/knn_optim_parallel.Rd new file mode 100644 index 0000000..510240d --- /dev/null +++ b/man/knn_optim_parallel.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/knn_optim_parallel.R +\name{knn_optim_parallel} +\alias{knn_optim_parallel} +\title{Parallel k and d optimization} +\usage{ +knn_optim_parallel(y, k, d, v = 1, init = NULL, + distance_metric = "euclidean", error_metric = "MAE", + weight = "proximity", threads = NULL) +} +\arguments{ +\item{y}{A time series.} + +\item{k}{Values of k's to be analyzed.} + +\item{d}{Values of d's to be analyzed.} + +\item{v}{Variable to be predicted if given multivariate time series.} + +\item{init}{Variable that determines the limit of the known past for the first instant predicted.} + +\item{distance_metric}{Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan, +dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method' +argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances. 
+Link to the package info: \url{https://cran.r-project.org/package=parallelDist}. +Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".} + +\item{error_metric}{Type of metric to evaluate the prediction error. +Five metrics supported: +\describe{ + \item{ME}{Mean Error} + \item{RMSE}{Root Mean Squared Error} + \item{MAE}{Mean Absolute Error} + \item{MPE}{Mean Percentage Error} + \item{MAPE}{Mean Absolute Percentage Error} +}} + +\item{weight}{Type of weight to be used at the time of calculating the predicted value with a weighted mean. +Three supported: proximity, same, linear. +\describe{ + \item{proximity}{the weight assigned to each neighbor is proportional to its distance} + \item{same}{all neighbors are assigned with the same weight} + \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the + least nearest neighbor which is assigned with a weight of 1.} +}} + +\item{threads}{Number of threads to be used when parallelizing, default is number of cores detected - 1 or +1 if there is only one core.} +} +\value{ +A matrix of errors, optimal k and d. +} +\description{ +Optimizes the values of K and D for a given time series. First, values corresponding to instants from init + 1 to the last one +are predicted. The first value predicted, which corresponds to instant init + 1, is calculated using instants from 1 to +instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from 1 +to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series), +is predicted using instants from 1 to instant n - 1. Finally, the error is evaluated between the predicted values and +the real values of the series. 
+This version of the optimization function uses a parallelized distances calculation function, and the computation of +the predicted values is done parallelizing by the number of d's and the number of instants to be predicted. +} +\examples{ +knn_optim_parallel(AirPassengers, 1:5, 1:3, threads = 2) +knn_optim_parallel(LakeHuron, 1:10, 1:6, threads = 2) +} diff --git a/man/knn_optim_parallel2.Rd b/man/knn_optim_parallel2.Rd new file mode 100644 index 0000000..d887a2d --- /dev/null +++ b/man/knn_optim_parallel2.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/knn_optim_parallel2.R +\name{knn_optim_parallel2} +\alias{knn_optim_parallel2} +\title{Parallel k and d optimization} +\usage{ +knn_optim_parallel2(y, k, d, v = 1, init = NULL, + distance_metric = "euclidean", error_metric = "MAE", + weight = "proximity", threads = NULL) +} +\arguments{ +\item{y}{A time series.} + +\item{k}{Values of k's to be analyzed.} + +\item{d}{Values of d's to be analyzed.} + +\item{v}{Variable to be predicted if given multivariate time series.} + +\item{init}{Variable that determines the limit of the known past for the first instant predicted.} + +\item{distance_metric}{Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan, +dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method' +argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances. +Link to the package info: \url{https://cran.r-project.org/package=parallelDist}. +Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".} + +\item{error_metric}{Type of metric to evaluate the prediction error. 
+Five metrics supported: +\describe{ + \item{ME}{Mean Error} + \item{RMSE}{Root Mean Squared Error} + \item{MAE}{Mean Absolute Error} + \item{MPE}{Mean Percentage Error} + \item{MAPE}{Mean Absolute Percentage Error} +}} + +\item{weight}{Type of weight to be used at the time of calculating the predicted value with a weighted mean. +Three supported: proximity, same, linear. +\describe{ + \item{proximity}{the weight assigned to each neighbor is proportional to its distance} + \item{same}{all neighbors are assigned with the same weight} + \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the + least nearest neighbor which is assigned with a weight of 1.} +}} + +\item{threads}{Number of threads to be used when parallelizing, default is number of cores detected - 1 or +1 if there is only one core.} +} +\value{ +A matrix of errors, optimal k and d. +} +\description{ +Optimizes the values of k and d for a given time series. First, values corresponding to instants from init + 1 to the last one +are predicted. The first value predicted, which corresponds to instant init + 1, is calculated using instants from 1 to +instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from 1 +to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series), +is predicted using instants from 1 to instant n - 1. Finally, the error is evaluated between the predicted values and +the real values of the series. +This version of the optimization function uses a parallelized distances calculation function, and the computation of +the predicted values is done parallelizing by the number of d's. 
+} +\examples{ +knn_optim_parallel2(AirPassengers, 1:5, 1:3, threads = 2) +knn_optim_parallel2(LakeHuron, 1:10, 1:6, threads = 2) +} diff --git a/man/knn_optim_parallelf.Rd b/man/knn_optim_parallelf.Rd new file mode 100644 index 0000000..6d45693 --- /dev/null +++ b/man/knn_optim_parallelf.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/knn_optim_parallelf.R +\name{knn_optim_parallelf} +\alias{knn_optim_parallelf} +\title{Parallel k and d optimization reading from files} +\usage{ +knn_optim_parallelf(y, k, d, v = 1, init = NULL, error_metric = "MAE", + weight = "proximity", threads = NULL, file, cols) +} +\arguments{ +\item{y}{A time series.} + +\item{k}{Values of k's to be analyzed.} + +\item{d}{Values of d's to be analyzed.} + +\item{v}{Variable to be predicted if given multivariate time series.} + +\item{init}{Variable that determines the limit of the known past for the first instant predicted.} + +\item{error_metric}{Type of metric to evaluate the prediction error. +Five metrics supported: +\describe{ + \item{ME}{Mean Error} + \item{RMSE}{Root Mean Squared Error} + \item{MAE}{Mean Absolute Error} + \item{MPE}{Mean Percentage Error} + \item{MAPE}{Mean Absolute Percentage Error} +}} + +\item{weight}{Type of weight to be used at the time of calculating the predicted value with a weighted mean. +Three supported: proximity, same, linear.
+\describe{ + \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance, so closer neighbors weigh more} + \item{same}{all neighbors are assigned with the same weight} + \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the + least nearest neighbor which is assigned with a weight of 1.} +}} + +\item{threads}{Number of threads to be used when parallelizing, default is number of cores detected - 1 or +1 if there is only one core.} + +\item{file}{Path and id of the files where the distances matrixes are.} + +\item{cols}{Number of columns per file.} +} +\value{ +A list with the matrix of errors (\code{errors}) and the optimal \code{k} and \code{d}. +} +\description{ +Optimizes the values of k and d for a given time series. First, values corresponding to instants from init + 1 to the last one +are predicted. The first value predicted, which corresponds to instant init + 1, is calculated using instants from 1 to +instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from 1 +to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series), +is predicted using instants from 1 to instant n - 1. Finally, the error is evaluated between the predicted values and +the real values of the series. +This version of the optimization function uses a parallelized distances calculation function, and the computation of +the predicted values is done parallelizing by the number of d's and the number of instants to be predicted. Each thread +that calculates predicted values reads only the part of the corresponding distances matrix in which the information used +to predict is contained.
+} +\examples{ +knn_distances(AirPassengers, 1:3, file = "AirPassengers", cols = 2, threads = 2) +knn_optim_parallelf(AirPassengers, 1:5, 1:3, file = "AirPassengers", cols = 2, threads = 2) +knn_distances(LakeHuron, 1:6, file = "LakeHuron", cols = 10, threads = 2) +knn_optim_parallelf(LakeHuron, 1:10, 1:6, file = "LakeHuron", cols = 10, threads = 2) +} diff --git a/man/knn_past.Rd b/man/knn_past.Rd new file mode 100644 index 0000000..6e7bbac --- /dev/null +++ b/man/knn_past.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/knn_past.R +\name{knn_past} +\alias{knn_past} +\title{Past time prediction} +\usage{ +knn_past(y, k, d, v = 1, init = NULL, distance_metric = "euclidean", + weight = "proximity", threads = NULL) +} +\arguments{ +\item{y}{A time series.} + +\item{k}{Number of neighbors.} + +\item{d}{Length of each of the 'elements'.} + +\item{v}{Variable to be predicted if given multivariate time series.} + +\item{init}{Variable that determines the limit of the known past for the first instant predicted.} + +\item{distance_metric}{Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan, +dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method' +argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances. +Link to the package info: \url{https://cran.r-project.org/package=parallelDist}. +Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".} + +\item{weight}{Type of weight to be used at the time of calculating the predicted value with a weighted mean. +Three supported: proximity, same, linear. 
+\describe{ + \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance, so closer neighbors weigh more} + \item{same}{all neighbors are assigned with the same weight} + \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the + least nearest neighbor which is assigned with a weight of 1.} +}} + +\item{threads}{Number of threads to be used when parallelizing, default is number of cores detected - 1 or +1 if there is only one core.} +} +\value{ +The predicted values. +} +\description{ +Predicts values of the time series using k-nearest neighbors algorithm. Values corresponding to instants from init + 1 to +the last one are predicted. The first value predicted, which corresponds to instant init + 1, is calculated using instants +from 1 to instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from +1 to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series), +is predicted using instants from 1 to instant n - 1. +} +\examples{ +knn_past(AirPassengers, 5, 2, threads = 2) +knn_past(LakeHuron, 3, 6, threads = 2) +}