From fc857acb7d999c1f0246922d1cf704c5f3b9f0a0 Mon Sep 17 00:00:00 2001 From: MMenchero Date: Mon, 30 Oct 2023 18:20:32 -0600 Subject: [PATCH] WIP: Added better support for tsibbles and improved function to infer frequency. --- DESCRIPTION | 1 + NAMESPACE | 8 ++++-- R/date_conversion.R | 40 +++++++++++++++++++++++++++ R/find_frequency.R | 48 -------------------------------- R/infer_frequency.R | 53 ++++++++++++++++++++++++++++++++++++ R/nixtlaR-package.R | 2 ++ R/prepare_data.R | 22 +++++++++++++++ R/prepare_multi_series.R | 30 -------------------- R/prepare_single_series.R | 28 ------------------- R/timegpt_forecast.R | 53 ++++++++++++++++++++++++++++-------- man/date_conversion.Rd | 17 ++++++++++++ man/find_frequency.Rd | 19 ------------- man/infer_frequency.Rd | 17 ++++++++++++ man/prepare_data.Rd | 17 ++++++++++++ man/prepare_multi_series.Rd | 23 ---------------- man/prepare_single_series.Rd | 21 -------------- man/timegpt_forecast.Rd | 5 +++- 17 files changed, 220 insertions(+), 184 deletions(-) create mode 100644 R/date_conversion.R delete mode 100644 R/find_frequency.R create mode 100644 R/infer_frequency.R create mode 100644 R/prepare_data.R delete mode 100644 R/prepare_multi_series.R delete mode 100644 R/prepare_single_series.R create mode 100644 man/date_conversion.Rd delete mode 100644 man/find_frequency.Rd create mode 100644 man/infer_frequency.Rd create mode 100644 man/prepare_data.Rd delete mode 100644 man/prepare_multi_series.Rd delete mode 100644 man/prepare_single_series.Rd diff --git a/DESCRIPTION b/DESCRIPTION index db0366f..b946d58 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,6 +15,7 @@ Depends: LazyData: true Imports: httr2, + lubridate, tsibble Suggests: httptest2, diff --git a/NAMESPACE b/NAMESPACE index 9e1395f..20aaa22 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,8 @@ # Generated by roxygen2: do not edit by hand -export(find_frequency) -export(prepare_multi_series) -export(prepare_single_series) +export(date_conversion) +export(infer_frequency) +export(prepare_data) export(set_token) export(timegpt_forecast) export(validate_token) @@ -10,4 +10,6 @@ importFrom(httr2,req_headers) importFrom(httr2,req_perform) importFrom(httr2,request) importFrom(httr2,resp_status) +importFrom(lubridate,ymd) +importFrom(lubridate,ymd_hms) importFrom(tsibble,is_tsibble) diff --git a/R/date_conversion.R b/R/date_conversion.R new file mode 100644 index 0000000..4151a39 --- /dev/null +++ b/R/date_conversion.R @@ -0,0 +1,40 @@ +#' Infer frequency of a tsibble and convert its index to date or string. +#' +#' @param df A tsibble. +#' +#' @return A list with the inferred frequency and df with the new index. +#' @export +#' +date_conversion <- function(df){ + cls <- class(df$ds)[1] + + if(cls == "integer"){ + freq <- "Y" + df$ds <- paste0(df$ds, "-01-01") + + }else if(cls %in% c("yearquarter", "yearmonth", "yearweek")){ + freq <- switch(cls, + yearquarter = "Q", + yearmonth = "MS", + yearweek = "W") + df$ds <- as.Date(df$ds) + df$ds <- as.character(df$ds) + + }else if(cls == "Date"){ + freq <- "D" + + }else if(cls %in% c("POSIXct", "POSIXt")){ + freq <- "H" + + }else{ + freq <- NULL + + } + + if(!is.null(freq)){ + message(paste0("Frequency chosen: ", freq)) + } + + res <- list(df = df, freq = freq) + return(res) +} diff --git a/R/find_frequency.R b/R/find_frequency.R deleted file mode 100644 index 7c92ce0..0000000 --- a/R/find_frequency.R +++ /dev/null @@ -1,48 +0,0 @@ -#' Find frequency of time series data -#' -#' @param df A tsibble or a data frame with time series data. -#' @param time_col Column that contains each time step. -#' -#' @return An alias (character) for the frequency. -#' @export -#' -find_frequency <- function(df, time_col){ - - if(!tsibble::is_tsibble(df) & !is.data.frame(df)){ - stop("Only tsibbles or data frames are allowed.") - } - - idx <- which(colnames(df) == time_col) - colnames(df)[idx] <- "ds" # rename to use tsibble::guess_frequency() - df$ds <- as.Date(df$ds) - - dates <- sort(unique(df$ds)) - dates_diff <- diff(dates) - freq_table <- sort(table(dates_diff)) - mode <- freq_table[length(freq_table)] - freq_num <- as.numeric(names(mode)) - - freq_list = list( - list(alias = "Y", value = c(365,366)), - list(alias = "Q", value = c(91,92)), - list(alias = "MS", value = c(30,31)), - list(alias = "W", value = c(7)), - list(alias = "D", value = c(1)) - ) - - freq <- NA - for(i in 1:length(freq_list)){ - if(freq_num %in% freq_list[i][[1]]$value){ - freq <- freq_list[i][[1]]$alias - } - } - - if(is.na(freq)){ - freq <- 1 - message("I'm not sure about the frequency of the data. Will default to 1. Please provide it if you know it.") - } - - message(paste0("Frequency chosen: ", freq)) - - return(freq) -} diff --git a/R/infer_frequency.R b/R/infer_frequency.R new file mode 100644 index 0000000..017b104 --- /dev/null +++ b/R/infer_frequency.R @@ -0,0 +1,53 @@ +#' Infer frequency of a data frame. +#' +#' @param df A data frame with time series data. +#' +#' @return The inferred frequency. +#' @export +#' +infer_frequency <- function(df){ + + freq <- NA + dates <- sort(unique(df$ds)) + + # Check if it's hourly data + nchrs <- lapply(as.character(dates), nchar) + ntable <- sort(table(unlist(nchrs))) + nmode <- ntable[length(ntable)] + nmode <- as.numeric(names(nmode)) + + if(nmode > 10){ + freq <- "H" # We'll assume hourly data + message("Frequency chosen: H") + return(freq) + } + + # If it's not hourly data, check the time differences in days + ddiff <- diff(as.Date(dates)) + table <- sort(table(ddiff)) + mode <- table[length(table)] + mode <- as.numeric(names(mode)) + + freq_list = list( + list(alias = "Y", value = c(365,366)), + list(alias = "Q", value = c(91,92)), + list(alias = "MS", value = c(30,31)), + list(alias = "W", value = c(7)), + list(alias = "D", value = c(1)) + ) + + for(i in 1:length(freq_list)){ + if(mode %in% freq_list[i][[1]]$value){ + freq <- freq_list[i][[1]]$alias + } + } + + if(is.na(freq)){ + freq <- "D" + message("I'm not sure about the frequency of the data. Will default to daily (D). Please provide it if you know it.") + } + + message(paste0("Frequency chosen: ", freq)) + + return(freq) +} diff --git a/R/nixtlaR-package.R b/R/nixtlaR-package.R index 883670a..2af9823 100644 --- a/R/nixtlaR-package.R +++ b/R/nixtlaR-package.R @@ -6,6 +6,8 @@ #' @importFrom httr2 req_perform #' @importFrom httr2 request #' @importFrom httr2 resp_status +#' @importFrom lubridate ymd +#' @importFrom lubridate ymd_hms #' @importFrom tsibble is_tsibble ## usethis namespace: end NULL diff --git a/R/prepare_data.R b/R/prepare_data.R new file mode 100644 index 0000000..aef5119 --- /dev/null +++ b/R/prepare_data.R @@ -0,0 +1,22 @@ +#' Prepare time series data for TimeGPT's API +#' +#' @param df A tsibble or a data frame with time series data. +#' +#' @return A list with the time series data for TimeGPT's API. +#' @export +#' +prepare_data <- function(df){ + if("unique_id" %in% names(df)){ + df <- df[, c("unique_id", "ds", "y")] + y <- list( + columns = names(df), + data = lapply(1:nrow(df), function(i) as.list(df[i,])) + ) + }else{ + # only "ds" and "y" columns + y <- df$y + names(y) <- df$ds + y <- as.list(y) + } + return(y) +} diff --git a/R/prepare_multi_series.R b/R/prepare_multi_series.R deleted file mode 100644 index aa11a71..0000000 --- a/R/prepare_multi_series.R +++ /dev/null @@ -1,30 +0,0 @@ -#' Prepare multiple time series data for TimeGPT -#' -#' @param df A tsibble or a data frame with multiple time series data. -#' @param id_col Column that identifies each series. -#' @param time_col Column that identifies each timestep. -#' @param target_col Column that contains the target variable. -#' -#' @return A list with the time series data that will be sent to TimeGPT. -#' @export -#' -prepare_multi_series <- function(df, id_col, time_col, target_col){ - - if(!tsibble::is_tsibble(df) & !is.data.frame(df)){ - stop("Only tsibbles or data frames are allowed.") - } - - df <- df[,c(id_col, time_col, target_col)] - colnames(df) <- c("unique_id", "ds", "y") - - if(tsibble::is_tsibble(df)){ - df$ds <- as.Date(df$ds) # this transforms dates from a tsibble - } - - y <- list( - columns = colnames(df), - data = lapply(1:nrow(df), function(i) as.list(df[i,])) - ) - - return(y) -} diff --git a/R/prepare_single_series.R b/R/prepare_single_series.R deleted file mode 100644 index 96b55a4..0000000 --- a/R/prepare_single_series.R +++ /dev/null @@ -1,28 +0,0 @@ -#' Prepare single time series data for TimeGPT -#' -#' @param df A tsibble or a data frame with a single time series data. -#' @param time_col Column that identifies each timestep. -#' @param target_col Column that contains the target variable. -#' -#' @return A list with the time series data that will be sent to TimeGPT. -#' @export -#' -prepare_single_series <- function(df, time_col, target_col){ - - if(!tsibble::is_tsibble(df) & !is.data.frame(df)){ - stop("Only tsibbles or data frames are allowed.") - } - - df <- df[,c(time_col, target_col)] - colnames(df) <- c("ds", "y") - - if(tsibble::is_tsibble(df)){ - df$ds <- as.Date(df$ds) # this transforms dates from a tsibble - } - - y <- df$y - names(y) <- df$ds - y <- as.list(y) - - return(y) -} diff --git a/R/timegpt_forecast.R b/R/timegpt_forecast.R index c2b1cfc..7aed88b 100644 --- a/R/timegpt_forecast.R +++ b/R/timegpt_forecast.R @@ -6,6 +6,7 @@ #' @param id_col Column that identifies each series. #' @param time_col Column that identifies each timestep. #' @param target_col Column that contains the target variable. +#' @param X_df A tsibble or a data frame with future exogenous variables. #' @param level The confidence levels (0-100) for the prediction intervals. #' @param finetune_steps Number of steps used to finetune TimeGPT in the new data. #' @param clean_ex_first Clean exogenous signal before making the forecasts using TimeGPT. @@ -14,7 +15,7 @@ #' @return TimeGPT forecasts for point predictions and probabilistic predictions (if level is not NULL). #' @export #' -timegpt_forecast <- function(df, h, freq=NULL, id_col=NULL, time_col="ds", target_col="y", level=NULL, finetune_steps=0, clean_ex_first=TRUE, add_history=FALSE){ +timegpt_forecast <- function(df, h=8, freq=NULL, id_col=NULL, time_col="ds", target_col="y", X_df=NULL, level=NULL, finetune_steps=0, clean_ex_first=TRUE, add_history=FALSE){ token <- get("NIXTLAR_TOKEN", envir = nixtlaR_env) @@ -22,21 +23,35 @@ timegpt_forecast <- function(df, h, freq=NULL, id_col=NULL, time_col="ds", targe stop("Only tsibbles or data frames are allowed.") } + # Rename columns + names(df)[which(names(df) == time_col)] <- "ds" + names(df)[which(names(df) == target_col)] <- "y" + if(!is.null(id_col)){ + names(df)[which(names(df) == id_col)] <- "unique_id" + } + + # If df is a tsibble, convert dates to strings and infer frequency + if(tsibble::is_tsibble(df)){ + res <- date_conversion(df) + df <- res$df + freq <- res$freq + } + + # Infer frequency if not available + if(is.null(freq)){ + freq <- infer_frequency(df) + } + # Check if single or multi-series and prepare data if(is.null(id_col)){ url <- "https://dashboard.nixtla.io/api/timegpt" series_type <- "single" - y <- prepare_single_series(df, time_col, target_col) }else{ url <- "https://dashboard.nixtla.io/api/timegpt_multi_series" series_type <- "multi" - y <- prepare_multi_series(df, id_col, time_col, target_col) } - # Prepare request - if(is.null(freq)){ - freq <- find_frequency(df, time_col) - } + y <- prepare_data(df) timegpt_data <- list( fh = h, @@ -46,6 +61,18 @@ timegpt_forecast <- function(df, h, freq=NULL, id_col=NULL, time_col="ds", targe clean_ex_first = clean_ex_first ) + # if(!is.null(X_df)){ + # names(X_df)[which(names(X_df) == time_col)] <- "ds" + # if(!is.null(id_col)){ + # names(X_df)[which(names(X_df) == id_col)] <- "unique_id" + # } + # x <- list( + # columns = names(X_df), + # data = lapply(1:nrow(X_df), function(i) as.list(X_df[i,])) + # ) + # timegpt_data[["x"]] <- x + # } + if(!is.null(level)){ level <- as.list(level) # TimeGPT requires level to be a list. # Users of the forecast package are used to define the level as a vector. @@ -157,11 +184,15 @@ timegpt_forecast <- function(df, h, freq=NULL, id_col=NULL, time_col="ds", targe } } - # This part needs work # Return a tsibble if the input was a tsibble - #if(tsibble::is_tsibble(df)){ - # fcst <- tsibble::as_tsibble(fcst, key = "unique_id", index = "ds") - #} + if(tsibble::is_tsibble(df)){ + if(freq == "H"){ + fcst$ds <- lubridate::ymd_hms(fcst$ds) + }else{ + fcst$ds <- lubridate::ymd(fcst$ds) + } + fcst <- tsibble::as_tsibble(fcst, key="unique_id", index="ds") + } # Rename columns to original names colnames(fcst)[which(colnames(fcst) == "ds")] <- time_col diff --git a/man/date_conversion.Rd b/man/date_conversion.Rd new file mode 100644 index 0000000..86ac3a9 --- /dev/null +++ b/man/date_conversion.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/date_conversion.R +\name{date_conversion} +\alias{date_conversion} +\title{Infer frequency of a tsibble and convert its index to date or string.} +\usage{ +date_conversion(df) +} +\arguments{ +\item{df}{A tsibble.} +} +\value{ +A list with the inferred frequency and df with the new index. +} +\description{ +Infer frequency of a tsibble and convert its index to date or string. +} diff --git a/man/find_frequency.Rd b/man/find_frequency.Rd deleted file mode 100644 index b0fd927..0000000 --- a/man/find_frequency.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/find_frequency.R -\name{find_frequency} -\alias{find_frequency} -\title{Find frequency of time series data} -\usage{ -find_frequency(df, time_col) -} -\arguments{ -\item{df}{A tsibble or a data frame with time series data.} - -\item{time_col}{Column that contains each time step.} -} -\value{ -An alias (character) for the frequency. -} -\description{ -Find frequency of time series data -} diff --git a/man/infer_frequency.Rd b/man/infer_frequency.Rd new file mode 100644 index 0000000..5ffd0a5 --- /dev/null +++ b/man/infer_frequency.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/infer_frequency.R +\name{infer_frequency} +\alias{infer_frequency} +\title{Infer frequency of a data frame.} +\usage{ +infer_frequency(df) +} +\arguments{ +\item{df}{A data frame with time series data.} +} +\value{ +The inferred frequency. +} +\description{ +Infer frequency of a data frame. +} diff --git a/man/prepare_data.Rd b/man/prepare_data.Rd new file mode 100644 index 0000000..3aecbe8 --- /dev/null +++ b/man/prepare_data.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/prepare_data.R +\name{prepare_data} +\alias{prepare_data} +\title{Prepare time series data for TimeGPT's API} +\usage{ +prepare_data(df) +} +\arguments{ +\item{df}{A tsibble or a data frame with time series data.} +} +\value{ +A list with the time series data for TimeGPT's API. +} +\description{ +Prepare time series data for TimeGPT's API +} diff --git a/man/prepare_multi_series.Rd b/man/prepare_multi_series.Rd deleted file mode 100644 index 0c0113e..0000000 --- a/man/prepare_multi_series.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/prepare_multi_series.R -\name{prepare_multi_series} -\alias{prepare_multi_series} -\title{Prepare multiple time series data for TimeGPT} -\usage{ -prepare_multi_series(df, id_col, time_col, target_col) -} -\arguments{ -\item{df}{A tsibble or a data frame with multiple time series data.} - -\item{id_col}{Column that identifies each series.} - -\item{time_col}{Column that identifies each timestep.} - -\item{target_col}{Column that contains the target variable.} -} -\value{ -A list with the time series data that will be sent to TimeGPT. -} -\description{ -Prepare multiple time series data for TimeGPT -} diff --git a/man/prepare_single_series.Rd b/man/prepare_single_series.Rd deleted file mode 100644 index 85d6304..0000000 --- a/man/prepare_single_series.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/prepare_single_series.R -\name{prepare_single_series} -\alias{prepare_single_series} -\title{Prepare single time series data for TimeGPT} -\usage{ -prepare_single_series(df, time_col, target_col) -} -\arguments{ -\item{df}{A tsibble or a data frame with a single time series data.} - -\item{time_col}{Column that identifies each timestep.} - -\item{target_col}{Column that contains the target variable.} -} -\value{ -A list with the time series data that will be sent to TimeGPT. -} -\description{ -Prepare single time series data for TimeGPT -} diff --git a/man/timegpt_forecast.Rd b/man/timegpt_forecast.Rd index 71003aa..552e9e5 100644 --- a/man/timegpt_forecast.Rd +++ b/man/timegpt_forecast.Rd @@ -6,11 +6,12 @@ \usage{ timegpt_forecast( df, - h, + h = 8, freq = NULL, id_col = NULL, time_col = "ds", target_col = "y", + X_df = NULL, level = NULL, finetune_steps = 0, clean_ex_first = TRUE, @@ -30,6 +31,8 @@ timegpt_forecast( \item{target_col}{Column that contains the target variable.} +\item{X_df}{A tsibble or a data frame with future exogenous variables.} + \item{level}{The confidence levels (0-100) for the prediction intervals.} \item{finetune_steps}{Number of steps used to finetune TimeGPT in the new data.}