Skip to content
This repository has been archived by the owner on Jan 8, 2020. It is now read-only.

Commit

Permalink
Package re-upload
Browse files Browse the repository at this point in the history
  • Loading branch information
Dani-Basta committed Mar 4, 2019
1 parent c42ef80 commit a9da6da
Show file tree
Hide file tree
Showing 20 changed files with 1,295 additions and 0 deletions.
25 changes: 25 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
Package: knnp
Version: 1.0.0
Date: 2018-06-18
Title: Time Series Prediction using K-Nearest Neighbors Algorithm
(Parallel)
Authors@R: c(
person("Daniel", "Bastarrica Lacalle", email="[email protected]", role=c("aut")),
person("Javier", "Berdecio Trigueros", email="[email protected]", role=c("aut", "cre"))
)
Depends: R (>= 3.3.3)
Imports: parallelDist, forecast, stats, utils, doParallel, foreach
Description: Two main functionalities are provided. One of them is predicting values with
k-nearest neighbors algorithm and the other is optimizing the parameters k and d of the algorithm.
These are carried out in parallel using multiple threads.
License: AGPL-3
RoxygenNote: 6.0.1
URL: https://github.com/Dani-Basta/TFG
BugReports: https://github.com/Dani-Basta/TFG/issues
NeedsCompilation: no
Packaged: 2018-06-18 04:15:01 UTC; javier
Author: Daniel Bastarrica Lacalle [aut],
Javier Berdecio Trigueros [aut, cre]
Maintainer: Javier Berdecio Trigueros <[email protected]>
Repository: CRAN
Date/Publication: 2018-07-01 15:00:02 UTC
19 changes: 19 additions & 0 deletions MD5
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
959faf2367e8343a11d14540d6a79e2a *DESCRIPTION
21a5cd3c7003d8c722ebef3867bdd70c *NAMESPACE
df60a570ff76d200c2f5f10314ac936a *NEWS.md
b70db3525ef00b36e4cfafdbb2451e1f *R/knn_distances.R
afbca240fc22fd11670cbf84420c1875 *R/knn_elements.R
4b4cfde192f48759147fba1567948a83 *R/knn_next.R
0670854a515ec04ff6828a44b6e68f19 *R/knn_optim.R
17fbd8144d41877fb50f3b3f4bb8e0aa *R/knn_optim_parallel.R
1a9c5e30c3a3762e0bdbe47b7b20e892 *R/knn_optim_parallel2.R
3c128712999e376d49fa65a156930a37 *R/knn_optim_parallelf.R
b7698a8076ed266064cbf29599f254a2 *R/knn_past.R
0cdeeae6ebeb075d9a8c2f53db3cc196 *man/knn_distances.Rd
357273707843f33aecf579b18633a465 *man/knn_elements.Rd
03be41ee392eceae25c60cb6db79308f *man/knn_next.Rd
ac69fe9d3c9f59fa326d3d22024ddef1 *man/knn_optim.Rd
dad5b7d9e21d91a5d08ff00de988d902 *man/knn_optim_parallel.Rd
e5e510d3cda1d310ee7607c39368f6ac *man/knn_optim_parallel2.Rd
3220a538d15044e6771cae6885352f44 *man/knn_optim_parallelf.Rd
d5b201bc2047d0ca9c0e1fd1c4326f28 *man/knn_past.Rd
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Generated by roxygen2: do not edit by hand

export(knn_distances)
export(knn_next)
export(knn_optim)
export(knn_optim_parallel)
export(knn_optim_parallel2)
export(knn_optim_parallelf)
export(knn_past)
import(foreach)
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# knnp 1.0.0
* Initial release

62 changes: 62 additions & 0 deletions R/knn_distances.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#' Distance matrices computation and saving in files with a maximum number of columns
#'
#' Calculates one distance matrix for each d for the given time series and then saves them in files. Each file will
#' contain at most 'cols' columns of the corresponding distance matrix.
#'
#' @param y A time series.
#' @param d Values of d's to be analyzed.
#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan,
#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method'
#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances.
#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}.
#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".
#' @param threads Number of threads to be used when parallelizing distances calculation, default is number of cores detected - 1 or
#' 1 if there is only one core.
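#' @param file Path and id prefix of the files where the distance matrices will be saved. Each file name is built by
#' appending the value of d, an underscore and the file number to this prefix.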
#' @param cols Number of columns per file.
#' @examples
#' knn_distances(AirPassengers, 1:3, threads = 2, file = "AirPassengers", cols = 2)
#' knn_distances(LakeHuron, 1:6, threads = 2, file = "LakeHuron", cols = 10)
#' @export
knn_distances <- function(y, d, distance_metric = "euclidean", threads = NULL, file, cols = 1) {

  # Computes one distance matrix per value of 'd' and stores each of them on
  # disk, split in files that hold at most 'cols' columns of the matrix.
  # Files are named paste0(file, <d>, "_", <file number>).
  # Called for its side effect (writing files); returns NULL invisibly.

  # Fail before doing any expensive work if the output prefix is missing
  if (missing(file)) {
    stop("'file' must be provided", call. = FALSE)
  }

  # With less than one column per file the saving loop below would never
  # advance through the distances and would keep writing files forever
  if (length(cols) != 1 || is.na(cols) || cols < 1) {
    stop("'cols' must be a single number greater than or equal to 1",
         call. = FALSE)
  }

  # Default number of threads to be used: all cores but one. detectCores()
  # may return NA, in which case a single thread is used
  if (is.null(threads)) {
    cores <- parallel::detectCores()
    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
  }

  # Initialization of variables to be used
  y <- matrix(y, ncol = NCOL(y))
  n <- NROW(y)

  # Calculate one distance matrix for each d, as the distance varies
  # with the number of values that characterizes each 'element'. These
  # matrices are saved in files.
  for (act_d in d) {
    # Get 'elements' matrix
    elements_matrix <- knn_elements(y, act_d)

    # Calculate distances between every 'element'; the result is stored as a
    # single vector holding the columns of the lower triangle one after another
    distances <- parallelDist::parDist(elements_matrix, distance_metric,
                                       threads = threads)

    # Walk through that vector column by column. The first column has
    # n - act_d values and every following column has one value less.
    i <- 1
    num_of_file <- 1
    distances_length <- length(distances)
    act_column_length <- n - act_d
    while (i <= distances_length) {
      initial_i <- i

      # Advance 'i' over (at most) 'cols' whole columns
      j <- 1
      while (j <= cols && i <= distances_length) {
        i <- i + act_column_length
        act_column_length <- act_column_length - 1
        j <- j + 1
      }

      # Save the columns gathered in this step in their own file
      saveRDS(distances[initial_i:(i - 1)],
              paste0(file, act_d, "_", num_of_file))
      num_of_file <- num_of_file + 1
    }
  }

  invisible(NULL)
}
27 changes: 27 additions & 0 deletions R/knn_elements.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#' 'Elements' matrix computation
#'
#' Creates a matrix to be used for calculating distances. The most
#' recent 'element' is put in the first row of the matrix, the
#' second most recent 'element' in the second row and so on. Therefore,
#' the oldest 'element' is put in the last row.
#'
#' @param y A matrix.
#' @param d Length of each of the 'elements'.
#' @return A matrix to be used for calculating distances.
knn_elements <- function(y, d) {
  # Builds the matrix of 'elements' (windows of 'd' consecutive observations).
  # Row 1 holds the most recent element and the last row the oldest one. For
  # multivariate input the d values of the first variable come first, then
  # the d values of the second variable, and so on.

  # Plain vectors and univariate series have no dimensions; treat them as a
  # one-column matrix so that the column indexing below is always valid
  if (is.null(dim(y))) {
    y <- matrix(y, ncol = 1)
  }
  n <- NROW(y)
  m <- NCOL(y)

  # Fail early with a clear message instead of an obscure subscript or
  # 'nrow' error (or an empty matrix) further down
  if (length(d) != 1 || is.na(d) || d < 1 || d > n) {
    stop("'d' must be a single value between 1 and the number of ",
         "observations in 'y' (", n, ")", call. = FALSE)
  }

  # Offset between the first and the last starting position of an element
  last_elem <- n - d

  # Fill the matrix column by column: column 'col' holds the j-th value of
  # every element, reversed so that the newest element ends up in row 1
  elements_matrix <- matrix(nrow = last_elem + 1, ncol = d * m)
  col <- 1
  for (i in seq_len(m)) {
    for (j in seq_len(d)) {
      elements_matrix[, col] <- rev(y[j:(j + last_elem), i])
      col <- col + 1
    }
  }

  elements_matrix
}
62 changes: 62 additions & 0 deletions R/knn_next.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#' Next value prediction
#'
#' Predicts next value of the time series using k-nearest neighbors algorithm.
#'
#' @param y A time series.
#' @param k Number of neighbors.
#' @param d Length of each of the 'elements'.
#' @param v Variable to be predicted if given multivariate time series.
#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan,
#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method'
#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances.
#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}.
#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".
#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean.
#' Three supported: proximity, same, linear.
#' \describe{
#' \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance, so closer neighbors weigh more}
#' \item{same}{all neighbors are assigned with the same weight}
#' \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the
#' least nearest neighbor which is assigned with a weight of 1.}
#' }
#' @param threads Number of threads to be used when parallelizing distances calculation, default is number of cores detected - 1 or
#' 1 if there is only one core.
#' @return The predicted value.
#' @examples
#' knn_next(AirPassengers, 5, 2, threads = 2)
#' knn_next(LakeHuron, 3, 6, threads = 2)
#' @export
knn_next <- function(y, k, d, v = 1, distance_metric = "euclidean", weight = "proximity", threads = NULL) {

  # Predicts the value that follows the last observation of 'y' as the
  # weighted mean of the values that followed its k nearest 'elements'.

  # Reject unknown weighting schemes up front; otherwise switch() below would
  # return NULL and the failure would show up as an unrelated length error
  weight <- match.arg(weight, c("proximity", "same", "linear"))

  # Default number of threads to be used: all cores but one. detectCores()
  # may return NA, in which case a single thread is used
  if (is.null(threads)) {
    cores <- parallel::detectCores()
    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
  }

  # Initialization of variables to be used
  y <- matrix(y, ncol = NCOL(y))
  n <- NROW(y)

  # At least one past 'element' is needed to have a neighbor at all
  if (n - d < 1) {
    stop("'y' must have more than 'd' observations", call. = FALSE)
  }

  # Get 'elements' matrix
  elements_matrix <- knn_elements(y, d)

  # Calculate distances between every 'element', a 'triangular matrix' is
  # returned. Only the first column (its first n - d values) is used because
  # it corresponds to the distances between the most recent 'element' and the
  # rest of the 'elements'
  distances <- parallelDist::parDist(elements_matrix, distance_metric,
                                     threads = threads)[seq_len(n - d)]

  # Get the indexes of the k nearest 'elements', these are called neighbors.
  # Fewer than k are returned when the series does not contain that many.
  k_nn <- utils::head(sort.int(distances, index.return = TRUE)$ix, k)
  neighbors <- length(k_nn)

  # Calculate the weights for the weighted mean. They are sized by the number
  # of neighbors actually found so that every scheme works when k exceeds it.
  # The tiny constant avoids a division by zero for exact matches.
  weights <- switch(weight,
    proximity = 1 / (distances[k_nn] + .Machine$double.xmin * 1e150),
    same = rep.int(1, neighbors),
    linear = rev(seq_len(neighbors))
  )

  # Neighbor i is the 'element' that ends at instant n - i, so the value
  # that followed it is the one at instant n - i + 1
  stats::weighted.mean(y[n - k_nn + 1, v], weights)
}
127 changes: 127 additions & 0 deletions R/knn_optim.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#' k and d optimization
#'
#' Optimizes the values of k and d for a given time series. First, values corresponding to instants from init + 1 to the last one
#' are predicted. The first value predicted, which corresponds to instant init + 1, is calculated using instants from 1 to
#' instant init; the second value predicted, which corresponds to instant init + 2, is predicted using instants from 1
#' to instant init + 1; and so on until the last value, which corresponds to instant n (length of the given time series),
#' is predicted using instants from 1 to instant n - 1. Finally, the error is evaluated between the predicted values and
#' the real values of the series.
#' This version of the optimization function only uses one thread except for the distances matrixes calculation, for which the
#' number of threads to be used can be specified.
#'
#' @param y A time series.
#' @param k Values of k's to be analyzed.
#' @param d Values of d's to be analyzed.
#' @param v Variable to be predicted if given multivariate time series.
#' @param init Variable that determines the limit of the known past for the first instant predicted.
#' @param distance_metric Type of metric to evaluate the distance between points. Many metrics are supported: euclidean, manhattan,
#' dynamic time warping, canberra and others. For more information about the supported metrics check the values that 'method'
#' argument of function 'parDist' (from 'parallelDist' package) can take as this is the function used to calculate the distances.
#' Link to the package info: \url{https://cran.r-project.org/package=parallelDist}.
#' Some of the values that this argument can take are "euclidean", "manhattan", "dtw", "canberra", "chord".
#' @param error_metric Type of metric to evaluate the prediction error.
#' Five metrics supported:
#' \describe{
#' \item{ME}{Mean Error}
#' \item{RMSE}{Root Mean Squared Error}
#' \item{MAE}{Mean Absolute Error}
#' \item{MPE}{Mean Percentage Error}
#' \item{MAPE}{Mean Absolute Percentage Error}
#' }
#' @param weight Type of weight to be used at the time of calculating the predicted value with a weighted mean.
#' Three supported: proximity, same, linear.
#' \describe{
#' \item{proximity}{the weight assigned to each neighbor is inversely proportional to its distance, so closer neighbors weigh more}
#' \item{same}{all neighbors are assigned with the same weight}
#' \item{linear}{nearest neighbor is assigned with weight k, second closest neighbor with weight k-1, and so on until the
#' least nearest neighbor which is assigned with a weight of 1.}
#' }
#' @param threads Number of threads to be used when parallelizing, default is number of cores detected - 1 or
#' 1 if there is only one core.
#' @return A list with the matrix of errors (one row per k and one column per d), the optimal k and the optimal d.
#' @examples
#' knn_optim(AirPassengers, 1:5, 1:3, threads = 2)
#' knn_optim(LakeHuron, 1:10, 1:6, threads = 2)
#' @export
knn_optim <- function(y, k, d, v = 1, init = NULL, distance_metric = "euclidean", error_metric = "MAE", weight = "proximity", threads = NULL) {

  # Predicts instants init + 1 to n for every combination of k and d, using
  # only the past known at each instant, and returns the prediction error of
  # each combination together with the best k and d.

  # Reject unknown options up front instead of failing later with an
  # unrelated error
  error_metrics <- c("ME", "RMSE", "MAE", "MPE", "MAPE")
  error_metric <- match.arg(error_metric, error_metrics)
  weight <- match.arg(weight, c("proximity", "same", "linear"))

  # Index of the chosen metric in the accuracy result
  error_type <- match(error_metric, error_metrics)

  # Default number of threads to be used: all cores but one. detectCores()
  # may return NA, in which case a single thread is used
  if (is.null(threads)) {
    cores <- parallel::detectCores()
    threads <- if (is.na(cores) || cores <= 1) 1 else cores - 1
  }

  # Sort k or d vector if they are unsorted
  if (is.unsorted(k)) {
    k <- sort(k)
  }
  if (is.unsorted(d)) {
    d <- sort(d)
  }

  # Initialization of variables to be used
  y <- matrix(y, ncol = NCOL(y))
  n <- NROW(y)
  ks <- length(k)
  ds <- length(d)
  init <- if (is.null(init)) floor(n * 0.7) else init[1]

  # 'init' has to leave at least one instant to predict and at least one past
  # 'element' for the largest d. Otherwise the index ranges below would run
  # backwards and silently read distances from the wrong column
  if (is.na(init) || init >= n || init <= max(d)) {
    stop("'init' must be greater than every value of 'd' and smaller than ",
         "the number of observations in 'y' (", n, ")", call. = FALSE)
  }

  real_values <- matrix(y[(init + 1):n, v])
  errors <- matrix(nrow = ks, ncol = ds, dimnames = list(k, d))

  for (i in seq_len(ds)) {
    predictions <- matrix(nrow = ks, ncol = n - init)

    # Get 'elements' matrix
    elements_matrix <- knn_elements(y, d[i])

    # Calculate distances between every 'element', a 'triangular matrix' is
    # returned as a vector holding its columns one after another
    distances_matrix <- parallelDist::parDist(elements_matrix, distance_metric,
                                              threads = threads)
    distances_matrix_size <- attr(distances_matrix, "Size")

    # Column j holds the distances from the 'element' that ends at instant
    # n - j + 1 to every older 'element', so it is used to predict
    # instant n - j + 2
    for (j in (n - init + 1):2) {
      # Get column needed from the distances matrix and sort it
      initial_index <- distances_matrix_size * (j - 1) - j * (j - 1) / 2 + 1
      distances_col <- distances_matrix[initial_index:(initial_index + n - d[i] - j)]
      sorted_distances_col <- sort.int(distances_col, index.return = TRUE)

      for (k_index in seq_len(ks)) {
        # Get the indexes of the k nearest 'elements', these are called
        # neighbors. Fewer than k are returned when there are not that many.
        k_nn <- utils::head(sorted_distances_col$ix, k[k_index])
        neighbors <- length(k_nn)

        # Calculate the weights for the weighted mean. They are sized by the
        # number of neighbors actually found so that every scheme works when
        # k exceeds it. The tiny constant avoids a division by zero.
        weights <- switch(weight,
          proximity = 1 / (distances_col[k_nn] + .Machine$double.xmin * 1e150),
          same = rep.int(1, neighbors),
          linear = rev(seq_len(neighbors))
        )

        # Calculate the predicted value
        predictions[k_index, n - init + 2 - j] <-
          stats::weighted.mean(y[n - j + 2 - k_nn, v], weights)
      }
    }

    # Calculate error values between the known values and the predicted
    # values, these values correspond to instants init + 1 to n. This is done
    # for the current d and all k's
    for (k_index in seq_len(ks)) {
      errors[k_index, i] <- forecast::accuracy(stats::ts(predictions[k_index, ]),
                                               real_values)[error_type]
    }
  }

  # Construction of the list to be returned. ME and MPE are signed, so the
  # best combination is the one whose error is closest to zero; for the
  # other metrics, which are never negative, this is simply the minimum
  index_min_error <- which.min(abs(errors))
  opt_k <- k[((index_min_error - 1) %% ks) + 1]
  opt_d <- d[ceiling(index_min_error / ks)]

  list(errors = errors, k = opt_k, d = opt_d)
}
Loading

0 comments on commit a9da6da

Please sign in to comment.