This repository has been archived by the owner on Jan 8, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c42ef80
commit a9da6da
Showing
20 changed files
with
1,295 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
Package: knnp | ||
Version: 1.0.0 | ||
Date: 2018-06-18 | ||
Title: Time Series Prediction using K-Nearest Neighbors Algorithm | ||
(Parallel) | ||
Authors@R: c( | ||
person("Daniel", "Bastarrica Lacalle", email="[email protected]", role=c("aut")), | ||
person("Javier", "Berdecio Trigueros", email="[email protected]", role=c("aut", "cre")) | ||
) | ||
Depends: R (>= 3.3.3) | ||
Imports: parallelDist, forecast, stats, utils, doParallel, foreach | ||
Description: Two main functionalities are provided. One of them is predicting values with | ||
k-nearest neighbors algorithm and the other is optimizing the parameters k and d of the algorithm. | ||
These are carried out in parallel using multiple threads. | ||
License: AGPL-3 | ||
RoxygenNote: 6.0.1 | ||
URL: https://github.com/Dani-Basta/TFG | ||
BugReports: https://github.com/Dani-Basta/TFG/issues | ||
NeedsCompilation: no | ||
Packaged: 2018-06-18 04:15:01 UTC; javier | ||
Author: Daniel Bastarrica Lacalle [aut], | ||
Javier Berdecio Trigueros [aut, cre] | ||
Maintainer: Javier Berdecio Trigueros <[email protected]> | ||
Repository: CRAN | ||
Date/Publication: 2018-07-01 15:00:02 UTC |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
959faf2367e8343a11d14540d6a79e2a *DESCRIPTION | ||
21a5cd3c7003d8c722ebef3867bdd70c *NAMESPACE | ||
df60a570ff76d200c2f5f10314ac936a *NEWS.md | ||
b70db3525ef00b36e4cfafdbb2451e1f *R/knn_distances.R | ||
afbca240fc22fd11670cbf84420c1875 *R/knn_elements.R | ||
4b4cfde192f48759147fba1567948a83 *R/knn_next.R | ||
0670854a515ec04ff6828a44b6e68f19 *R/knn_optim.R | ||
17fbd8144d41877fb50f3b3f4bb8e0aa *R/knn_optim_parallel.R | ||
1a9c5e30c3a3762e0bdbe47b7b20e892 *R/knn_optim_parallel2.R | ||
3c128712999e376d49fa65a156930a37 *R/knn_optim_parallelf.R | ||
b7698a8076ed266064cbf29599f254a2 *R/knn_past.R | ||
0cdeeae6ebeb075d9a8c2f53db3cc196 *man/knn_distances.Rd | ||
357273707843f33aecf579b18633a465 *man/knn_elements.Rd | ||
03be41ee392eceae25c60cb6db79308f *man/knn_next.Rd | ||
ac69fe9d3c9f59fa326d3d22024ddef1 *man/knn_optim.Rd | ||
dad5b7d9e21d91a5d08ff00de988d902 *man/knn_optim_parallel.Rd | ||
e5e510d3cda1d310ee7607c39368f6ac *man/knn_optim_parallel2.Rd | ||
3220a538d15044e6771cae6885352f44 *man/knn_optim_parallelf.Rd | ||
d5b201bc2047d0ca9c0e1fd1c4326f28 *man/knn_past.Rd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(knn_distances) | ||
export(knn_next) | ||
export(knn_optim) | ||
export(knn_optim_parallel) | ||
export(knn_optim_parallel2) | ||
export(knn_optim_parallelf) | ||
export(knn_past) | ||
import(foreach) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# knnp 1.0.0 | ||
* Initial release | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#' Distances matrixes computation and saving in files with a maximum of columns
#'
#' Calculates one distances matrix per each d for the given time series and then saves them in files. Each file will
#' contain a maximum of 'cols' columns from the corresponding distances matrix.
#'
#' Each file is written with \code{saveRDS} and is named by concatenating \code{file}, the value of d, an underscore
#' and a file counter that starts at 1 for every d (for example \code{"AirPassengers2_1"}).
#'
#' @param y A time series.
#' @param d Values of d's to be analyzed.
#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan,
#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method'
#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances.
#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}.
#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".
#' @param threads Number of threads to be used when parallelizing distances calculation, default is number of cores detected - 1 or
#' 1 if there is only one core (or if the number of cores cannot be determined).
#' @param file Path and id of the files where the distances matrixes will be saved.
#' @param cols Number of columns per file. Must be at least 1.
#' @return Called for its side effect of writing files; returns \code{NULL} invisibly.
#' @examples
#' knn_distances(AirPassengers, 1:3, threads = 2, file = file.path(tempdir(), "AirPassengers"), cols = 2)
#' knn_distances(LakeHuron, 1:6, threads = 2, file = file.path(tempdir(), "LakeHuron"), cols = 10)
#' @export
knn_distances <- function(y, d, distance_metric = "euclidean", threads = NULL, file, cols = 1) {

  # With cols < 1 the inner loop below would never advance and the outer loop
  # would write files forever, so reject it up front
  if (length(cols) != 1L || is.na(cols) || cols < 1) {
    stop("'cols' must be a single number greater than or equal to 1", call. = FALSE)
  }

  # Default number of threads to be used; detectCores() may return NA
  if (is.null(threads)) {
    cores <- parallel::detectCores()
    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
  }

  # Initialization of variables to be used
  y <- matrix(y, ncol = NCOL(y))
  n <- NROW(y)

  # Calculate one distances matrix for each d, as the distance varies
  # with the number of values that characterizes each 'element'. These matrixes
  # are saved in files.
  for (act_d in d) {
    # Get 'elements' matrix
    elements_matrix <- knn_elements(y, act_d)

    # Calculate distances between every 'element', a 'triangular matrix' is returned
    distances <- parallelDist::parDist(elements_matrix, distance_metric, threads = threads)

    # Save distances matrix in files that will contain a determined number of columns.
    # The triangular matrix is stored column by column, and every column is one
    # entry shorter than the previous one, starting with n - act_d entries.
    i <- 1
    num_of_file <- 1
    distances_length <- length(distances)
    act_column_length <- n - act_d
    while (i <= distances_length) {
      initial_i <- i
      j <- 1
      # Advance 'i' over at most 'cols' whole columns
      while (j <= cols && i <= distances_length) {
        i <- i + act_column_length
        act_column_length <- act_column_length - 1
        j <- j + 1
      }
      saveRDS(distances[initial_i:(i - 1)], paste0(file, act_d, "_", num_of_file))
      num_of_file <- num_of_file + 1
    }
  }

  invisible(NULL)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#' 'Elements' matrix computation
#'
#' Creates a matrix to be used for calculating distances. The most
#' recent 'element' is put in the first row of the matrix, the
#' second most recent 'element' in the second row and so on. Therefore,
#' the oldest 'element' is put in the last row.
#'
#' Each row holds, for every variable (column of \code{y}) in turn, the
#' \code{d} consecutive values of that variable that form the 'element'.
#'
#' @param y A matrix with one column per variable. Vectors and univariate
#' time series are accepted too and are treated as a one column matrix.
#' @param d Length of each of the 'elements'. Must be between 1 and the
#' number of rows of \code{y}.
#' @return A matrix to be used for calculating distances, with
#' \code{NROW(y) - d + 1} rows and \code{d * NCOL(y)} columns.
knn_elements <- function(y, d) {
  # Two-subscript indexing below needs a matrix; this is a no-op for matrices
  y <- as.matrix(y)
  n <- nrow(y)
  m <- ncol(y)

  if (length(d) != 1L || is.na(d) || d < 1 || d > n) {
    stop("'d' must be a single value between 1 and the number of observations (", n, ")",
         call. = FALSE)
  }

  last_elem <- n - d

  # Fill matrix as described above, it is done vertically for efficiency reasons:
  # column j of a variable holds the j-th value of every 'element', reversed so
  # that the most recent 'element' comes first
  elements_matrix <- matrix(nrow = last_elem + 1, ncol = d * m)
  col <- 1
  for (i in seq_len(m)) {
    for (j in seq_len(d)) {
      elements_matrix[, col] <- rev(y[j:(j + last_elem), i])
      col <- col + 1
    }
  }

  elements_matrix
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#' Next value prediction
#'
#' Predicts next value of the time series using k-nearest neighbors algorithm.
#'
#' @param y A time series.
#' @param k Number of neighbors. If it is greater than the number of available
#' 'elements', all of them are used as neighbors.
#' @param d Length of each of the 'elements'. Must be lower than the length of the time series.
#' @param v Variable to be predicted if given multivariate time series.
#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan,
#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method'
#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances.
#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}.
#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".
#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean.
#' Three supported: proximity, same, linear.
#' \describe{
#'   \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance}
#'   \item{same}{all neighbors are assigned with the same weight}
#'   \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the
#'                least nearest neighbor which is assigned with a weight of 1.}
#' }
#' @param threads Number of threads to be used when parallelizing distances calculation, default is number of cores detected - 1 or
#' 1 if there is only one core (or if the number of cores cannot be determined).
#' @return The predicted value.
#' @examples
#' knn_next(AirPassengers, 5, 2, threads = 2)
#' knn_next(LakeHuron, 3, 6, threads = 2)
#' @export
knn_next <- function(y, k, d, v = 1, distance_metric = "euclidean", weight = "proximity", threads = NULL) {

  # Fail early with a clear message instead of a NULL weights vector later on
  weight <- match.arg(weight, c("proximity", "same", "linear"))

  # Default number of threads to be used; detectCores() may return NA
  if (is.null(threads)) {
    cores <- parallel::detectCores()
    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
  }

  # Initialization of variables to be used
  y <- matrix(y, ncol = NCOL(y))
  n <- NROW(y)

  if (length(k) != 1L || is.na(k) || k < 1) {
    stop("'k' must be a single value greater than or equal to 1", call. = FALSE)
  }
  # At least one 'element' besides the most recent one is needed as neighbor
  if (length(d) != 1L || is.na(d) || d < 1 || d >= n) {
    stop("'d' must be a single value between 1 and the number of observations minus 1 (",
         n - 1, ")", call. = FALSE)
  }

  # Get 'elements' matrix
  elements_matrix <- knn_elements(y, d)

  # Calculate distances between every 'element', a 'triangular matrix' is returned
  # Only the first column is used because it corresponds to the distances
  # between the most recent 'element' and the rest of the 'elements'
  distances <- parallelDist::parDist(elements_matrix, distance_metric, threads = threads)[seq_len(n - d)]

  # Get the indexes of the k nearest 'elements', these are called neighbors
  k_nn <- utils::head(sort.int(distances, index.return = TRUE)$ix, k)

  # There may be fewer than k 'elements' available, so the weights are sized
  # from the neighbors actually found to keep them aligned with the values
  k_found <- length(k_nn)

  # Calculate the weights for the future computation of the weighted mean.
  # The tiny constant added to the distances avoids a division by zero.
  weights <- switch(weight,
                    proximity = 1 / (distances[k_nn] + .Machine$double.xmin * 1e150),
                    same = rep.int(1, k_found),
                    linear = rev(seq_len(k_found)))

  # Calculate the predicted value: neighbor index i is the 'element' that ends
  # at instant n - i, so the value that followed it is at instant n - i + 1
  stats::weighted.mean(y[n - k_nn + 1, v], weights)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
#' k and d optimization
#'
#' Optimizes the values of k and d for a given time series. First, values corresponding to instants from init + 1 to the last one
#' are predicted. The first value predicted, which corresponds to instant init + 1, is calculated using instants from 1 to
#' instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from 1
#' to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series),
#' is predicted using instants from 1 to instant n - 1. Finally, the error is evaluated between the predicted values and
#' the real values of the series.
#' This version of the optimization function only uses one thread except for the distances matrixes calculation, for which the
#' number of threads to be used can be specified.
#'
#' @param y A time series.
#' @param k Values of k's to be analyzed. All of them must be at least 1.
#' @param d Values of d's to be analyzed. All of them must be lower than init.
#' @param v Variable to be predicted if given multivariate time series.
#' @param init Variable that determines the limit of the known past for the first instant predicted. Default is
#' 70\% of the length of the time series (rounded down). Must be lower than the length of the time series.
#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan,
#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method'
#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances.
#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}.
#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".
#' @param error_metric Type of metric to evaluate the prediction error.
#' Five metrics supported:
#' \describe{
#'   \item{ME}{Mean Error}
#'   \item{RMSE}{Root Mean Squared Error}
#'   \item{MAE}{Mean Absolute Error}
#'   \item{MPE}{Mean Percentage Error}
#'   \item{MAPE}{Mean Absolute Percentage Error}
#' }
#' For the signed metrics (ME and MPE) the optimal combination is the one whose error is closest to zero.
#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean.
#' Three supported: proximity, same, linear.
#' \describe{
#'   \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance}
#'   \item{same}{all neighbors are assigned with the same weight}
#'   \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the
#'                least nearest neighbor which is assigned with a weight of 1.}
#' }
#' @param threads Number of threads to be used when parallelizing, default is number of cores detected - 1 or
#' 1 if there is only one core (or if the number of cores cannot be determined).
#' @return A list with the matrix of errors (one row per k, one column per d), the optimal k and the optimal d.
#' @examples
#' knn_optim(AirPassengers, 1:5, 1:3, threads = 2)
#' knn_optim(LakeHuron, 1:10, 1:6, threads = 2)
#' @export
knn_optim <- function(y, k, d, v = 1, init = NULL, distance_metric = "euclidean", error_metric = "MAE", weight = "proximity", threads = NULL) {

  # Fail early with a clear message on unsupported options
  error_names <- c("ME", "RMSE", "MAE", "MPE", "MAPE")
  error_metric <- match.arg(error_metric, error_names)
  weight <- match.arg(weight, c("proximity", "same", "linear"))

  # Default number of threads to be used; detectCores() may return NA
  if (is.null(threads)) {
    cores <- parallel::detectCores()
    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
  }

  # Choose the appropriate index of the accuracy result, depending on the error_metric
  error_type <- match(error_metric, error_names)

  # Sort k or d vector if they are unsorted
  if (is.unsorted(k)) {
    k <- sort(k)
  }
  if (is.unsorted(d)) {
    d <- sort(d)
  }

  # Initialization of variables to be used
  y <- matrix(y, ncol = NCOL(y))
  n <- NROW(y)
  ks <- length(k)
  ds <- length(d)
  if (is.null(init)) {
    init <- floor(n * 0.7)
  }

  if (ks < 1L || anyNA(k) || any(k < 1)) {
    stop("'k' must contain at least one value and all of them must be >= 1", call. = FALSE)
  }
  if (length(init) != 1L || is.na(init) || init < 1 || init >= n) {
    stop("'init' must be a single value between 1 and the number of observations minus 1 (",
         n - 1, ")", call. = FALSE)
  }
  # The first prediction uses the 'element' that ends at instant init and needs
  # at least one older 'element' as neighbor, which requires d < init
  if (ds < 1L || anyNA(d) || any(d < 1) || any(d >= init)) {
    stop("'d' must contain at least one value and all of them must be between 1 and init - 1 (",
         init - 1, ")", call. = FALSE)
  }

  real_values <- matrix(y[(init + 1):n, v])
  errors <- matrix(nrow = ks, ncol = ds, dimnames = list(k, d))

  for (i in seq_len(ds)) {
    predictions <- matrix(nrow = ks, ncol = n - init)

    # Get 'elements' matrix
    elements_matrix <- knn_elements(y, d[i])

    # Calculate distances between every 'element', a 'triangular matrix' is returned
    distances_matrix <- parallelDist::parDist(elements_matrix, distance_metric, threads = threads)
    distances_matrix_size <- attr(distances_matrix, "Size")

    # Column j holds the distances between the 'element' that ends at instant
    # n - j + 1 and all the older 'elements'; it is used to predict instant n - j + 2
    for (j in (n - init + 1):2) {
      # Get column needed from the distances matrix and sort it
      initial_index <- distances_matrix_size * (j - 1) - j * (j - 1) / 2 + 1
      distances_col <- distances_matrix[initial_index:(initial_index + n - d[i] - j)]
      sorted_distances_col <- sort.int(distances_col, index.return = TRUE)

      for (k_index in seq_len(ks)) {
        k_value <- k[k_index]

        # Get the indexes of the k nearest 'elements', these are called neighbors
        k_nn <- utils::head(sorted_distances_col$ix, k_value)

        # There may be fewer than k_value 'elements' in this column, so the weights
        # are sized from the neighbors actually found to keep them aligned
        k_found <- length(k_nn)

        # Calculate the weights for the future computation of the weighted mean.
        # The tiny constant added to the distances avoids a division by zero.
        weights <- switch(weight,
                          proximity = 1 / (distances_col[k_nn] + .Machine$double.xmin * 1e150),
                          same = rep.int(1, k_found),
                          linear = rev(seq_len(k_found)))

        # Calculate the predicted value
        predictions[k_index, n - init + 2 - j] <- stats::weighted.mean(y[n - j + 2 - k_nn, v], weights)
      }
    }

    # Calculate error values between the known values and the predicted values, these values
    # correspond to instants init + 1 to n. This is done for the current d and all k's
    for (k_index in seq_len(ks)) {
      errors[k_index, i] <- forecast::accuracy(stats::ts(predictions[k_index, ]), real_values)[error_type]
    }
  }

  # Construction of the list to be returned. The absolute value is taken because
  # ME and MPE are signed: the best combination is the one closest to zero, not
  # the most negative one. It does not affect the non-negative metrics.
  index_min_error <- which.min(abs(errors))
  opt_k <- k[((index_min_error - 1) %% ks) + 1]
  opt_d <- d[ceiling(index_min_error / ks)]

  list(errors = errors, k = opt_k, d = opt_d)
}
Oops, something went wrong.