Package re-upload

Dani-Basta · Mar 4, 2019 · a9da6da · a9da6da
1 parent c42ef80
commit a9da6da
Show file tree

Hide file tree

Showing 20 changed files with 1,295 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,25 @@
+Package: knnp
+Version: 1.0.0
+Date: 2018-06-18
+Title: Time Series Prediction using K-Nearest Neighbors Algorithm
+        (Parallel)
+Authors@R: c(
+  person("Daniel", "Bastarrica Lacalle", email="[email protected]", role=c("aut")),
+  person("Javier", "Berdecio Trigueros", email="[email protected]", role=c("aut", "cre"))
+  )
+Depends: R (>= 3.3.3)
+Imports: parallelDist, forecast, stats, utils, doParallel, foreach
+Description: Two main functionalities are provided. One of them is predicting values with 
+    k-nearest neighbors algorithm and the other is optimizing the parameters k and d of the algorithm.
+    These are carried out in parallel using multiple threads.
+License: AGPL-3
+RoxygenNote: 6.0.1
+URL: https://github.com/Dani-Basta/TFG
+BugReports: https://github.com/Dani-Basta/TFG/issues
+NeedsCompilation: no
+Packaged: 2018-06-18 04:15:01 UTC; javier
+Author: Daniel Bastarrica Lacalle [aut],
+  Javier Berdecio Trigueros [aut, cre]
+Maintainer: Javier Berdecio Trigueros <[email protected]>
+Repository: CRAN
+Date/Publication: 2018-07-01 15:00:02 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,19 @@
+959faf2367e8343a11d14540d6a79e2a *DESCRIPTION
+21a5cd3c7003d8c722ebef3867bdd70c *NAMESPACE
+df60a570ff76d200c2f5f10314ac936a *NEWS.md
+b70db3525ef00b36e4cfafdbb2451e1f *R/knn_distances.R
+afbca240fc22fd11670cbf84420c1875 *R/knn_elements.R
+4b4cfde192f48759147fba1567948a83 *R/knn_next.R
+0670854a515ec04ff6828a44b6e68f19 *R/knn_optim.R
+17fbd8144d41877fb50f3b3f4bb8e0aa *R/knn_optim_parallel.R
+1a9c5e30c3a3762e0bdbe47b7b20e892 *R/knn_optim_parallel2.R
+3c128712999e376d49fa65a156930a37 *R/knn_optim_parallelf.R
+b7698a8076ed266064cbf29599f254a2 *R/knn_past.R
+0cdeeae6ebeb075d9a8c2f53db3cc196 *man/knn_distances.Rd
+357273707843f33aecf579b18633a465 *man/knn_elements.Rd
+03be41ee392eceae25c60cb6db79308f *man/knn_next.Rd
+ac69fe9d3c9f59fa326d3d22024ddef1 *man/knn_optim.Rd
+dad5b7d9e21d91a5d08ff00de988d902 *man/knn_optim_parallel.Rd
+e5e510d3cda1d310ee7607c39368f6ac *man/knn_optim_parallel2.Rd
+3220a538d15044e6771cae6885352f44 *man/knn_optim_parallelf.Rd
+d5b201bc2047d0ca9c0e1fd1c4326f28 *man/knn_past.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,10 @@
+# Generated by roxygen2: do not edit by hand
+
+export(knn_distances)
+export(knn_next)
+export(knn_optim)
+export(knn_optim_parallel)
+export(knn_optim_parallel2)
+export(knn_optim_parallelf)
+export(knn_past)
+import(foreach)
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,3 @@
+# knnp 1.0.0
+* Initial release
+
diff --git a/R/knn_distances.R b/R/knn_distances.R
@@ -0,0 +1,62 @@
+#' Distance matrices computation and saving in files with a maximum of columns
+#'
+#' Calculates one distance matrix for each value of d for the given time series and then saves them in files.
+#' Each file contains at most 'cols' columns of the corresponding distance matrix. Files are written with
+#' 'saveRDS' and are named by pasting together 'file', the value of d, an underscore and the number of the file
+#' (starting at 1).
+#'
+#' @param y A time series.
+#' @param d Values of d's to be analyzed. Each of them must be between 1 and the length of the series.
+#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan,
+#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method'
+#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances.
+#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}.
+#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".
+#' @param threads Number of threads to be used when parallelizing distances calculation, default is number of cores detected - 1 or
+#' 1 if there is only one core or the number of cores cannot be determined.
+#' @param file Path and id of the files where the distance matrices will be saved.
+#' @param cols Maximum number of columns per file, must be at least 1.
+#' @return NULL, invisibly. The function is called for its side effect of writing files.
+#' @examples
+#' knn_distances(AirPassengers, 1:3, threads = 2,
+#'               file = file.path(tempdir(), "AirPassengers"), cols = 2)
+#' knn_distances(LakeHuron, 1:6, threads = 2,
+#'               file = file.path(tempdir(), "LakeHuron"), cols = 10)
+#' @export
+knn_distances <- function(y, d, distance_metric = "euclidean", threads = NULL, file, cols = 1){
+
+  # Default number of threads to be used
+  if (is.null(threads)) {
+    cores <- parallel::detectCores()
+    # detectCores() returns NA when the number of cores is unknown
+    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
+  }
+
+  # With cols < 1 the column-grouping loop below would never advance and
+  # therefore never terminate, so reject such values up front
+  if (!is.numeric(cols) || length(cols) != 1 || is.na(cols) || cols < 1) {
+    stop("'cols' must be a single number greater than or equal to 1", call. = FALSE)
+  }
+
+  # Initialization of variables to be used
+  y <- matrix(y, ncol = NCOL(y))
+  n <- NROW(y)
+
+  if (anyNA(d) || any(d < 1) || any(d > n)) {
+    stop("every value of 'd' must be between 1 and the length of the series", call. = FALSE)
+  }
+
+  # Calculate one distance matrix for each d, as the distance varies
+  # with the number of values that characterizes each 'element'. These matrices
+  # are saved in files.
+  for (act_d in d) {
+    # Get 'elements' matrix
+    elements_matrix <- knn_elements(y, act_d)
+
+    # Calculate distances between every 'element', a 'triangular matrix' is returned
+    distances <- parallelDist::parDist(elements_matrix, distance_metric, threads = threads)
+
+    # Save the distance matrix in files that will contain a determined number of columns.
+    # The triangular matrix is stored column by column in a single vector: the first
+    # column has n - act_d values and each following column has one value less.
+    i <- 1
+    num_of_file <- 1
+    distances_length <- length(distances)
+    act_column_length <- n - act_d
+    while (i <= distances_length) {
+      initial_i <- i
+      j <- 1
+      # Advance i over at most 'cols' whole columns
+      while (j <= cols && i <= distances_length) {
+        i <- i + act_column_length
+        act_column_length <- act_column_length - 1
+        j <- j + 1
+      }
+      saveRDS(distances[initial_i:(i - 1)], paste0(file, act_d, "_", num_of_file))
+      num_of_file <- num_of_file + 1
+    }
+
+  }
+
+  invisible(NULL)
+}
diff --git a/R/knn_elements.R b/R/knn_elements.R
@@ -0,0 +1,27 @@
+#' 'Elements' matrix computation
+#'
+#' Creates a matrix to be used for calculating distances. The most
+#' recent 'element' is put in the first row of the matrix, the
+#' second most recent 'element' in the second row and so on. Therefore,
+#' the oldest 'element' is put in the last row. For a multivariate series
+#' the d values of the first variable come first in each row, followed by
+#' the d values of the second variable and so on.
+#'
+#' @param y A matrix with one column per variable. A plain vector is
+#' treated as a one-column matrix.
+#' @param d Length of each of the 'elements', a single value between 1 and
+#' the number of rows of y.
+#' @return A matrix with (number of rows of y) - d + 1 rows and
+#' d * (number of columns of y) columns to be used for calculating distances.
+knn_elements <- function(y, d) {
+  # Accept plain vectors too by treating them as a one-column matrix
+  if (is.null(dim(y))) {
+    y <- matrix(y, ncol = 1)
+  }
+  n <- NROW(y)
+  m <- NCOL(y)
+
+  # An out-of-range d would otherwise fail later with an obscure indexing error
+  if (length(d) != 1 || is.na(d) || d < 1 || d > n) {
+    stop("'d' must be a single value between 1 and the number of rows of 'y'", call. = FALSE)
+  }
+  last_elem <- n - d
+
+  # Fill matrix as described above, it is done vertically for efficiency reasons:
+  # column j of a variable holds the j-th value of every 'element', and rev()
+  # puts the most recent 'element' in the first row
+  elements_matrix <- matrix(nrow = last_elem + 1, ncol = d * m)
+  col <- 1
+  for (i in seq_len(m)) {
+    for (j in seq_len(d)) {
+      elements_matrix[, col] <- rev(y[j:(j + last_elem), i])
+      col <- col + 1
+    }
+  }
+
+  elements_matrix
+}
diff --git a/R/knn_next.R b/R/knn_next.R
@@ -0,0 +1,62 @@
+#' Next value prediction
+#'
+#' Predicts next value of the time series using k-nearest neighbors algorithm.
+#'
+#' @param y A time series.
+#' @param k Number of neighbors, at least 1. If it is greater than the number of available 'elements', all of them are used.
+#' @param d Length of each of the 'elements', between 1 and the length of the series minus 1.
+#' @param v Variable to be predicted if given multivariate time series.
+#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan,
+#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method'
+#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances.
+#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}.
+#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".
+#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean.
+#' Three supported: proximity, same, linear.
+#' \describe{
+#'   \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance}
+#'   \item{same}{all neighbors are assigned with the same weight}
+#'   \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the
+#'                least nearest neighbor which is assigned with a weight of 1.}
+#' }
+#' @param threads Number of threads to be used when parallelizing distances calculation, default is number of cores detected - 1 or
+#' 1 if there is only one core or the number of cores cannot be determined.
+#' @return The predicted value.
+#' @examples
+#' knn_next(AirPassengers, 5, 2, threads = 2)
+#' knn_next(LakeHuron, 3, 6, threads = 2)
+#' @export
+knn_next <- function(y, k, d, v = 1, distance_metric = "euclidean", weight = "proximity", threads = NULL) {
+
+  # Fail early on an unknown weight, switch() would otherwise silently return NULL
+  weight <- match.arg(weight, c("proximity", "same", "linear"))
+
+  # Default number of threads to be used
+  if (is.null(threads)) {
+    cores <- parallel::detectCores()
+    # detectCores() returns NA when the number of cores is unknown
+    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
+  }
+
+  # Initialization of variables to be used
+  y <- matrix(y, ncol = NCOL(y))
+  n <- NROW(y)
+
+  # At least one past 'element' besides the most recent one is needed
+  if (length(d) != 1 || is.na(d) || d < 1 || d >= n) {
+    stop("'d' must be a single value between 1 and the length of the series minus 1", call. = FALSE)
+  }
+  if (length(k) != 1 || is.na(k) || k < 1) {
+    stop("'k' must be a single value greater than or equal to 1", call. = FALSE)
+  }
+
+  # Get 'elements' matrix
+  elements_matrix <- knn_elements(y, d)
+
+  # Calculate distances between every 'element', a 'triangular matrix' is returned
+  # Only the first column is used because it corresponds to the distances
+  # between the most recent 'element' and the rest of the 'elements'
+  distances <- parallelDist::parDist(elements_matrix, distance_metric, threads = threads)[seq_len(n - d)]
+
+  # Get the indexes of the k nearest 'elements', these are called neighbors
+  k_nn <- utils::head(sort.int(distances, index.return = TRUE)$ix, k)
+
+  # Fewer than k neighbors are available when k exceeds the number of 'elements',
+  # so the weights are sized by the neighbors actually found
+  neighbors <- length(k_nn)
+
+  # Calculate the weights for the future computation of the weighted mean.
+  # The tiny constant added to the distances avoids a division by zero.
+  weights <- switch(weight,
+                    proximity = 1 / (distances[k_nn] + .Machine$double.xmin * 1e150),
+                    same = rep.int(1, neighbors),
+                    linear = neighbors:1)
+
+  # Calculate the predicted value: index i in the first column refers to the
+  # 'element' that ends at instant n - i, so the value that followed it is y[n - i + 1]
+  prediction <- stats::weighted.mean(y[n - k_nn + 1, v], weights)
+
+  prediction
+}
diff --git a/R/knn_optim.R b/R/knn_optim.R
@@ -0,0 +1,127 @@
+#' k and d optimization
+#'
+#' Optimizes the values of k and d for a given time series. First, values corresponding to instants from init + 1 to the last one
+#' are predicted. The first value predicted, which corresponds to instant init + 1, is calculated using instants from 1 to
+#' instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from 1
+#' to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series),
+#' is predicted using instants from 1 to instant n - 1. Finally, the error is evaluated between the predicted values and
+#' the real values of the series.
+#' This version of the optimization function only uses one thread except for the distance matrices calculation, for which the
+#' number of threads to be used can be specified.
+#'
+#' @param y A time series.
+#' @param k Values of k's to be analyzed, all of them at least 1.
+#' @param d Values of d's to be analyzed, all of them at least 1.
+#' @param v Variable to be predicted if given multivariate time series.
+#' @param init Variable that determines the limit of the known past for the first instant predicted. It must be greater
+#' than every value of d and lower than the length of the series. Default is 70 percent of the length of the series, rounded down.
+#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan,
+#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method'
+#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances.
+#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}.
+#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".
+#' @param error_metric Type of metric to evaluate the prediction error.
+#' Five metrics supported:
+#' \describe{
+#'   \item{ME}{Mean Error}
+#'   \item{RMSE}{Root Mean Squared Error}
+#'   \item{MAE}{Mean Absolute Error}
+#'   \item{MPE}{Mean Percentage Error}
+#'   \item{MAPE}{Mean Absolute Percentage Error}
+#' }
+#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean.
+#' Three supported: proximity, same, linear.
+#' \describe{
+#'   \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance}
+#'   \item{same}{all neighbors are assigned with the same weight}
+#'   \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the
+#'                least nearest neighbor which is assigned with a weight of 1.}
+#' }
+#' @param threads Number of threads to be used when parallelizing, default is number of cores detected - 1 or
+#' 1 if there is only one core or the number of cores cannot be determined.
+#' @return A list with three components: 'errors', a matrix of errors with one row per k and one column per d;
+#' 'k', the optimal k; and 'd', the optimal d.
+#' @examples
+#' knn_optim(AirPassengers, 1:5, 1:3, threads = 2)
+#' knn_optim(LakeHuron, 1:10, 1:6, threads = 2)
+#' @export
+knn_optim <- function(y, k, d, v = 1, init = NULL, distance_metric = "euclidean", error_metric = "MAE", weight = "proximity", threads = NULL){
+
+    # Default number of threads to be used
+    if (is.null(threads)) {
+      cores <- parallel::detectCores()
+      # detectCores() returns NA when the number of cores is unknown
+      threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
+    }
+
+    # Fail early on unknown options, switch() would otherwise silently return NULL
+    error_metrics <- c("ME", "RMSE", "MAE", "MPE", "MAPE")
+    error_metric <- match.arg(error_metric, error_metrics)
+    weight <- match.arg(weight, c("proximity", "same", "linear"))
+
+    # Choose the appropriate index of the accuracy result, depending on the error_metric
+    error_type <- match(error_metric, error_metrics)
+
+    if (length(k) == 0 || anyNA(k) || any(k < 1)) {
+      stop("'k' must contain at least one value and all of them must be at least 1", call. = FALSE)
+    }
+    if (length(d) == 0 || anyNA(d) || any(d < 1)) {
+      stop("'d' must contain at least one value and all of them must be at least 1", call. = FALSE)
+    }
+
+    # Sort k or d vector if they are unsorted
+    if (is.unsorted(k)) {
+      k <- sort(k)
+    }
+    if (is.unsorted(d)) {
+      d <- sort(d)
+    }
+
+    # Initialization of variables to be used
+    y <- matrix(y, ncol = NCOL(y))
+    n <- NROW(y)
+    ks <- length(k)
+    ds <- length(d)
+    if (is.null(init)) {
+      init <- floor(n * 0.7)
+    }
+
+    # The first prediction needs at least one complete 'element' before instant init,
+    # and there must be at least one instant after init left to predict
+    if (length(init) != 1 || is.na(init) || init <= max(d) || init >= n) {
+      stop("'init' must be greater than every value of 'd' and lower than the length of the series", call. = FALSE)
+    }
+
+    real_values <- matrix(y[(init + 1):n, v])
+    errors <- matrix(nrow = ks, ncol = ds, dimnames = list(k, d))
+
+    for (i in seq_len(ds)) {
+        predictions <- matrix(nrow = ks, ncol = n - init)
+
+        # Get 'elements' matrix
+        elements_matrix <- knn_elements(y, d[i])
+
+        # Calculate distances between every 'element', a 'triangular matrix' is returned
+        distances_matrix <- parallelDist::parDist(elements_matrix, distance_metric, threads = threads)
+        distances_matrix_size <- attr(distances_matrix, "Size")
+
+        # Column j holds the distances between the 'element' that ends at instant
+        # n - j + 1 and all the older ones, so it is used to predict instant n - j + 2
+        for (j in (n - init + 1):2) {
+            # Get column needed from the distances matrix and sort it
+            initial_index <- distances_matrix_size * (j - 1) - j * (j - 1) / 2 + 1
+            distances_col <- distances_matrix[initial_index:(initial_index + n - d[i] - j)]
+            sorted_distances_col <- sort.int(distances_col, index.return = TRUE)
+
+            for (k_index in seq_len(ks)) {
+              k_value <- k[k_index]
+
+              # Get the indexes of the k nearest 'elements', these are called neighbors
+              k_nn <- utils::head(sorted_distances_col$ix, k_value)
+
+              # Fewer than k neighbors are available when k exceeds the length of the
+              # column, so the weights are sized by the neighbors actually found
+              neighbors <- length(k_nn)
+
+              # Calculate the weights for the future computation of the weighted mean.
+              # The tiny constant added to the distances avoids a division by zero.
+              weights <- switch(weight,
+                                proximity = 1 / (distances_col[k_nn] + .Machine$double.xmin * 1e150),
+                                same = rep.int(1, neighbors),
+                                linear = neighbors:1)
+
+              # Calculate the predicted value
+              predictions[k_index, n - init + 2 - j] <- stats::weighted.mean(y[n - j + 2 - k_nn, v], weights)
+            }
+        }
+
+        # Calculate error values between the known values and the predicted values, these values
+        # correspond to instants init + 1 to n. This is done for the current d and all k's
+        for (k_index in seq_len(ks)) {
+          errors[k_index, i] <- forecast::accuracy(stats::ts(predictions[k_index, ]), real_values)[error_type]
+        }
+    }
+
+    # Construction of the list to be returned. which.min() gives a column-major
+    # position in the errors matrix, which is converted back to a row (k) and a column (d)
+    index_min_error <- which.min(errors)
+    opt_k <- k[((index_min_error - 1) %% ks) + 1]
+    opt_d <- d[ceiling(index_min_error / ks)]
+    result <- list(errors = errors, k = opt_k, d = opt_d)
+
+    result
+}