From fc857acb7d999c1f0246922d1cf704c5f3b9f0a0 Mon Sep 17 00:00:00 2001
From: MMenchero <marianamenchero@gmail.com>
Date: Mon, 30 Oct 2023 18:20:32 -0600
Subject: [PATCH] WIP: Added better support for tsibbles and improved function
 to infer frequency.

---
 DESCRIPTION                  |  1 +
 NAMESPACE                    |  8 ++++--
 R/date_conversion.R          | 40 +++++++++++++++++++++++++++
 R/find_frequency.R           | 48 --------------------------------
 R/infer_frequency.R          | 53 ++++++++++++++++++++++++++++++++++++
 R/nixtlaR-package.R          |  2 ++
 R/prepare_data.R             | 22 +++++++++++++++
 R/prepare_multi_series.R     | 30 --------------------
 R/prepare_single_series.R    | 28 -------------------
 R/timegpt_forecast.R         | 53 ++++++++++++++++++++++++++++--------
 man/date_conversion.Rd       | 17 ++++++++++++
 man/find_frequency.Rd        | 19 -------------
 man/infer_frequency.Rd       | 17 ++++++++++++
 man/prepare_data.Rd          | 17 ++++++++++++
 man/prepare_multi_series.Rd  | 23 ----------------
 man/prepare_single_series.Rd | 21 --------------
 man/timegpt_forecast.Rd      |  5 +++-
 17 files changed, 220 insertions(+), 184 deletions(-)
 create mode 100644 R/date_conversion.R
 delete mode 100644 R/find_frequency.R
 create mode 100644 R/infer_frequency.R
 create mode 100644 R/prepare_data.R
 delete mode 100644 R/prepare_multi_series.R
 delete mode 100644 R/prepare_single_series.R
 create mode 100644 man/date_conversion.Rd
 delete mode 100644 man/find_frequency.Rd
 create mode 100644 man/infer_frequency.Rd
 create mode 100644 man/prepare_data.Rd
 delete mode 100644 man/prepare_multi_series.Rd
 delete mode 100644 man/prepare_single_series.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index db0366f..b946d58 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -15,6 +15,7 @@ Depends:
 LazyData: true
 Imports: 
     httr2,
+    lubridate,
     tsibble
 Suggests: 
     httptest2,
diff --git a/NAMESPACE b/NAMESPACE
index 9e1395f..20aaa22 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,8 +1,8 @@
 # Generated by roxygen2: do not edit by hand
 
-export(find_frequency)
-export(prepare_multi_series)
-export(prepare_single_series)
+export(date_conversion)
+export(infer_frequency)
+export(prepare_data)
 export(set_token)
 export(timegpt_forecast)
 export(validate_token)
@@ -10,4 +10,6 @@ importFrom(httr2,req_headers)
 importFrom(httr2,req_perform)
 importFrom(httr2,request)
 importFrom(httr2,resp_status)
+importFrom(lubridate,ymd)
+importFrom(lubridate,ymd_hms)
 importFrom(tsibble,is_tsibble)
diff --git a/R/date_conversion.R b/R/date_conversion.R
new file mode 100644
index 0000000..4151a39
--- /dev/null
+++ b/R/date_conversion.R
@@ -0,0 +1,40 @@
+#' Infer frequency of a tsibble and convert its index to date or string.
+#'
+#' @param df A tsibble.
+#'
+#' @return A list with the inferred frequency and df with the new index.
+#' @export
+#'
+date_conversion <- function(df){
+  cls <- class(df$ds)[1]
+
+  if(cls == "integer"){
+    freq <- "Y"
+    df$ds <- paste0(df$ds, "-01-01")
+
+  }else if(cls %in% c("yearquarter", "yearmonth", "yearweek")){
+    freq <- switch(cls,
+                   yearquarter = "Q",
+                   yearmonth = "MS",
+                   yearweek = "W")
+    df$ds <- as.Date(df$ds)
+    df$ds <- as.character(df$ds)
+
+  }else if(cls == "Date"){
+    freq <- "D"
+
+  }else if(cls %in% c("POSIXct", "POSIXt")){
+    freq <- "H"
+
+  }else{
+    freq <- NULL
+
+  }
+
+  if(!is.null(freq)){
+    message(paste0("Frequency chosen: ", freq))
+  }
+
+  res <- list(df = df, freq = freq)
+  return(res)
+}
diff --git a/R/find_frequency.R b/R/find_frequency.R
deleted file mode 100644
index 7c92ce0..0000000
--- a/R/find_frequency.R
+++ /dev/null
@@ -1,48 +0,0 @@
-#' Find frequency of time series data
-#'
-#' @param df A tsibble or a data frame with time series data.
-#' @param time_col Column that contains each time step.
-#'
-#' @return An alias (character) for the frequency.
-#' @export
-#'
-find_frequency <- function(df, time_col){
-
-  if(!tsibble::is_tsibble(df) & !is.data.frame(df)){
-    stop("Only tsibbles or data frames are allowed.")
-  }
-
-  idx <- which(colnames(df) == time_col)
-  colnames(df)[idx] <- "ds" # rename to use tsibble::guess_frequency()
-  df$ds <- as.Date(df$ds)
-
-  dates <- sort(unique(df$ds))
-  dates_diff <- diff(dates)
-  freq_table <- sort(table(dates_diff))
-  mode <- freq_table[length(freq_table)]
-  freq_num <- as.numeric(names(mode))
-
-  freq_list = list(
-    list(alias = "Y", value = c(365,366)),
-    list(alias = "Q", value = c(91,92)),
-    list(alias = "MS", value = c(30,31)),
-    list(alias = "W", value = c(7)),
-    list(alias = "D", value = c(1))
-  )
-
-  freq <- NA
-  for(i in 1:length(freq_list)){
-    if(freq_num %in% freq_list[i][[1]]$value){
-      freq <- freq_list[i][[1]]$alias
-    }
-  }
-
-  if(is.na(freq)){
-    freq <- 1
-    message("I'm not sure about the frequency of the data. Will default to 1. Please provide it if you know it.")
-  }
-
-  message(paste0("Frequency chosen: ", freq))
-
-  return(freq)
-}
diff --git a/R/infer_frequency.R b/R/infer_frequency.R
new file mode 100644
index 0000000..017b104
--- /dev/null
+++ b/R/infer_frequency.R
@@ -0,0 +1,53 @@
+#' Infer frequency of a data frame.
+#'
+#' @param df A data frame with time series data.
+#'
+#' @return The inferred frequency.
+#' @export
+#'
+infer_frequency <- function(df){
+
+  freq <- NA
+  dates <- sort(unique(df$ds))
+
+  # Check if it's hourly data
+  nchrs <- lapply(as.character(dates), nchar)
+  ntable <- sort(table(unlist(nchrs)))
+  nmode <- ntable[length(ntable)]
+  nmode <- as.numeric(names(nmode))
+
+  if(nmode > 10){
+    freq <- "H" # We'll assume hourly data
+    message("Frequency chosen: H")
+    return(freq)
+  }
+
+  # If it's not hourly data, check the time differences in days
+  ddiff <- diff(as.Date(dates))
+  table <- sort(table(ddiff))
+  mode <- table[length(table)]
+  mode <- as.numeric(names(mode))
+
+  freq_list = list(
+    list(alias = "Y", value = c(365,366)),
+    list(alias = "Q", value = c(91,92)),
+    list(alias = "MS", value = c(30,31)),
+    list(alias = "W", value = c(7)),
+    list(alias = "D", value = c(1))
+  )
+
+  for(i in 1:length(freq_list)){
+    if(mode %in% freq_list[i][[1]]$value){
+      freq <- freq_list[i][[1]]$alias
+    }
+  }
+
+  if(is.na(freq)){
+    freq <- "D"
+    message("I'm not sure about the frequency of the data. Will default to daily (D). Please provide it if you know it.")
+  }
+
+  message(paste0("Frequency chosen: ", freq))
+
+  return(freq)
+}
diff --git a/R/nixtlaR-package.R b/R/nixtlaR-package.R
index 883670a..2af9823 100644
--- a/R/nixtlaR-package.R
+++ b/R/nixtlaR-package.R
@@ -6,6 +6,8 @@
 #' @importFrom httr2 req_perform
 #' @importFrom httr2 request
 #' @importFrom httr2 resp_status
+#' @importFrom lubridate ymd
+#' @importFrom lubridate ymd_hms
 #' @importFrom tsibble is_tsibble
 ## usethis namespace: end
 NULL
diff --git a/R/prepare_data.R b/R/prepare_data.R
new file mode 100644
index 0000000..aef5119
--- /dev/null
+++ b/R/prepare_data.R
@@ -0,0 +1,22 @@
+#' Prepare time series data for TimeGPT's API
+#'
+#' @param df A tsibble or a data frame with time series data.
+#'
+#' @return A list with the time series data for TimeGPT's API.
+#' @export
+#'
+prepare_data <- function(df){
+  if("unique_id" %in% names(df)){
+    df <- df[, c("unique_id", "ds", "y")]
+    y <- list(
+      columns = names(df),
+      data = lapply(1:nrow(df), function(i) as.list(df[i,]))
+      )
+  }else{
+      # only "ds" and "y" columns
+      y <- df$y
+      names(y) <- df$ds
+      y <- as.list(y)
+  }
+  return(y)
+}
diff --git a/R/prepare_multi_series.R b/R/prepare_multi_series.R
deleted file mode 100644
index aa11a71..0000000
--- a/R/prepare_multi_series.R
+++ /dev/null
@@ -1,30 +0,0 @@
-#' Prepare multiple time series data for TimeGPT
-#'
-#' @param df A tsibble or a data frame with multiple time series data.
-#' @param id_col Column that identifies each series.
-#' @param time_col Column that identifies each timestep.
-#' @param target_col Column that contains the target variable.
-#'
-#' @return A list with the time series data that will be sent to TimeGPT.
-#' @export
-#'
-prepare_multi_series <- function(df, id_col, time_col, target_col){
-
-  if(!tsibble::is_tsibble(df) & !is.data.frame(df)){
-    stop("Only tsibbles or data frames are allowed.")
-  }
-
-  df <- df[,c(id_col, time_col, target_col)]
-  colnames(df) <- c("unique_id", "ds", "y")
-
-  if(tsibble::is_tsibble(df)){
-    df$ds <- as.Date(df$ds) # this transforms dates from a tsibble
-  }
-
-  y <- list(
-    columns = colnames(df),
-    data = lapply(1:nrow(df), function(i) as.list(df[i,]))
-    )
-
-  return(y)
-}
diff --git a/R/prepare_single_series.R b/R/prepare_single_series.R
deleted file mode 100644
index 96b55a4..0000000
--- a/R/prepare_single_series.R
+++ /dev/null
@@ -1,28 +0,0 @@
-#' Prepare single time series data for TimeGPT
-#'
-#' @param df A tsibble or a data frame with a single time series data.
-#' @param time_col Column that identifies each timestep.
-#' @param target_col Column that contains the target variable.
-#'
-#' @return A list with the time series data that will be sent to TimeGPT.
-#' @export
-#'
-prepare_single_series <- function(df, time_col, target_col){
-
-  if(!tsibble::is_tsibble(df) & !is.data.frame(df)){
-    stop("Only tsibbles or data frames are allowed.")
-  }
-
-  df <- df[,c(time_col, target_col)]
-  colnames(df) <- c("ds", "y")
-
-  if(tsibble::is_tsibble(df)){
-    df$ds <- as.Date(df$ds) # this transforms dates from a tsibble
-  }
-
-  y <- df$y
-  names(y) <- df$ds
-  y <- as.list(y)
-
-  return(y)
-}
diff --git a/R/timegpt_forecast.R b/R/timegpt_forecast.R
index c2b1cfc..7aed88b 100644
--- a/R/timegpt_forecast.R
+++ b/R/timegpt_forecast.R
@@ -6,6 +6,7 @@
 #' @param id_col Column that identifies each series.
 #' @param time_col Column that identifies each timestep.
 #' @param target_col Column that contains the target variable.
+#' @param X_df A tsibble or a data frame with future exogenous variables.
 #' @param level The confidence levels (0-100) for the prediction intervals.
 #' @param finetune_steps Number of steps used to finetune TimeGPT in the new data.
 #' @param clean_ex_first Clean exogenous signal before making the forecasts using TimeGPT.
@@ -14,7 +15,7 @@
 #' @return TimeGPT forecasts for point predictions and probabilistic predictions (if level is not NULL).
 #' @export
 #'
-timegpt_forecast <- function(df, h, freq=NULL, id_col=NULL, time_col="ds", target_col="y", level=NULL, finetune_steps=0, clean_ex_first=TRUE, add_history=FALSE){
+timegpt_forecast <- function(df, h=8, freq=NULL, id_col=NULL, time_col="ds", target_col="y", X_df=NULL, level=NULL, finetune_steps=0, clean_ex_first=TRUE, add_history=FALSE){
 
   token <- get("NIXTLAR_TOKEN", envir = nixtlaR_env)
 
@@ -22,21 +23,35 @@ timegpt_forecast <- function(df, h, freq=NULL, id_col=NULL, time_col="ds", targe
     stop("Only tsibbles or data frames are allowed.")
   }
 
+  # Rename columns
+  names(df)[which(names(df) == time_col)] <- "ds"
+  names(df)[which(names(df) == target_col)] <- "y"
+  if(!is.null(id_col)){
+    names(df)[which(names(df) == id_col)] <- "unique_id"
+  }
+
+  # If df is a tsibble, convert dates to strings and infer frequency
+  if(tsibble::is_tsibble(df)){
+    res <- date_conversion(df)
+    df <- res$df
+    freq <- res$freq
+  }
+
+  # Infer frequency if not available
+  if(is.null(freq)){
+    freq <- infer_frequency(df)
+  }
+
   # Check if single or multi-series and prepare data
   if(is.null(id_col)){
     url <- "https://dashboard.nixtla.io/api/timegpt"
     series_type <- "single"
-    y <- prepare_single_series(df, time_col, target_col)
   }else{
     url <- "https://dashboard.nixtla.io/api/timegpt_multi_series"
     series_type <- "multi"
-    y <- prepare_multi_series(df, id_col, time_col, target_col)
   }
 
-  # Prepare request
-  if(is.null(freq)){
-    freq <- find_frequency(df, time_col)
-  }
+  y <- prepare_data(df)
 
   timegpt_data <- list(
     fh = h,
@@ -46,6 +61,18 @@ timegpt_forecast <- function(df, h, freq=NULL, id_col=NULL, time_col="ds", targe
     clean_ex_first = clean_ex_first
     )
 
+  # if(!is.null(X_df)){
+  #   names(X_df)[which(names(X_df) == time_col)] <- "ds"
+  #   if(!is.null(id_col)){
+  #     names(X_df)[which(names(X_df) == id_col)] <- "unique_id"
+  #   }
+  #   x <- list(
+  #     columns = names(X_df),
+  #     data = lapply(1:nrow(X_df), function(i) as.list(X_df[i,]))
+  #   )
+  #   timegpt_data[["x"]] <- x
+  # }
+
   if(!is.null(level)){
     level <- as.list(level) # TimeGPT requires level to be a list.
     # Users of the forecast package are used to define the level as a vector.
@@ -157,11 +184,15 @@ timegpt_forecast <- function(df, h, freq=NULL, id_col=NULL, time_col="ds", targe
     }
   }
 
-  # This part needs work
   # Return a tsibble if the input was a tsibble
-  #if(tsibble::is_tsibble(df)){
-  #  fcst <- tsibble::as_tsibble(fcst, key = "unique_id", index = "ds")
-  #}
+  if(tsibble::is_tsibble(df)){
+    if(freq == "H"){
+      fcst$ds <- lubridate::ymd_hms(fcst$ds)
+    }else{
+      fcst$ds <- lubridate::ymd(fcst$ds)
+    }
+    fcst <- tsibble::as_tsibble(fcst, key="unique_id", index="ds")
+  }
 
   # Rename columns to original names
   colnames(fcst)[which(colnames(fcst) == "ds")] <- time_col
diff --git a/man/date_conversion.Rd b/man/date_conversion.Rd
new file mode 100644
index 0000000..86ac3a9
--- /dev/null
+++ b/man/date_conversion.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/date_conversion.R
+\name{date_conversion}
+\alias{date_conversion}
+\title{Infer frequency of a tsibble and convert its index to date or string.}
+\usage{
+date_conversion(df)
+}
+\arguments{
+\item{df}{A tsibble.}
+}
+\value{
+A list with the inferred frequency and df with the new index.
+}
+\description{
+Infer frequency of a tsibble and convert its index to date or string.
+}
diff --git a/man/find_frequency.Rd b/man/find_frequency.Rd
deleted file mode 100644
index b0fd927..0000000
--- a/man/find_frequency.Rd
+++ /dev/null
@@ -1,19 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/find_frequency.R
-\name{find_frequency}
-\alias{find_frequency}
-\title{Find frequency of time series data}
-\usage{
-find_frequency(df, time_col)
-}
-\arguments{
-\item{df}{A tsibble or a data frame with time series data.}
-
-\item{time_col}{Column that contains each time step.}
-}
-\value{
-An alias (character) for the frequency.
-}
-\description{
-Find frequency of time series data
-}
diff --git a/man/infer_frequency.Rd b/man/infer_frequency.Rd
new file mode 100644
index 0000000..5ffd0a5
--- /dev/null
+++ b/man/infer_frequency.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/infer_frequency.R
+\name{infer_frequency}
+\alias{infer_frequency}
+\title{Infer frequency of a data frame.}
+\usage{
+infer_frequency(df)
+}
+\arguments{
+\item{df}{A data frame with time series data.}
+}
+\value{
+The inferred frequency.
+}
+\description{
+Infer frequency of a data frame.
+}
diff --git a/man/prepare_data.Rd b/man/prepare_data.Rd
new file mode 100644
index 0000000..3aecbe8
--- /dev/null
+++ b/man/prepare_data.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/prepare_data.R
+\name{prepare_data}
+\alias{prepare_data}
+\title{Prepare time series data for TimeGPT's API}
+\usage{
+prepare_data(df)
+}
+\arguments{
+\item{df}{A tsibble or a data frame with time series data.}
+}
+\value{
+A list with the time series data for TimeGPT's API.
+}
+\description{
+Prepare time series data for TimeGPT's API
+}
diff --git a/man/prepare_multi_series.Rd b/man/prepare_multi_series.Rd
deleted file mode 100644
index 0c0113e..0000000
--- a/man/prepare_multi_series.Rd
+++ /dev/null
@@ -1,23 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/prepare_multi_series.R
-\name{prepare_multi_series}
-\alias{prepare_multi_series}
-\title{Prepare multiple time series data for TimeGPT}
-\usage{
-prepare_multi_series(df, id_col, time_col, target_col)
-}
-\arguments{
-\item{df}{A tsibble or a data frame with multiple time series data.}
-
-\item{id_col}{Column that identifies each series.}
-
-\item{time_col}{Column that identifies each timestep.}
-
-\item{target_col}{Column that contains the target variable.}
-}
-\value{
-A list with the time series data that will be sent to TimeGPT.
-}
-\description{
-Prepare multiple time series data for TimeGPT
-}
diff --git a/man/prepare_single_series.Rd b/man/prepare_single_series.Rd
deleted file mode 100644
index 85d6304..0000000
--- a/man/prepare_single_series.Rd
+++ /dev/null
@@ -1,21 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/prepare_single_series.R
-\name{prepare_single_series}
-\alias{prepare_single_series}
-\title{Prepare single time series data for TimeGPT}
-\usage{
-prepare_single_series(df, time_col, target_col)
-}
-\arguments{
-\item{df}{A tsibble or a data frame with a single time series data.}
-
-\item{time_col}{Column that identifies each timestep.}
-
-\item{target_col}{Column that contains the target variable.}
-}
-\value{
-A list with the time series data that will be sent to TimeGPT.
-}
-\description{
-Prepare single time series data for TimeGPT
-}
diff --git a/man/timegpt_forecast.Rd b/man/timegpt_forecast.Rd
index 71003aa..552e9e5 100644
--- a/man/timegpt_forecast.Rd
+++ b/man/timegpt_forecast.Rd
@@ -6,11 +6,12 @@
 \usage{
 timegpt_forecast(
   df,
-  h,
+  h = 8,
   freq = NULL,
   id_col = NULL,
   time_col = "ds",
   target_col = "y",
+  X_df = NULL,
   level = NULL,
   finetune_steps = 0,
   clean_ex_first = TRUE,
@@ -30,6 +31,8 @@ timegpt_forecast(
 
 \item{target_col}{Column that contains the target variable.}
 
+\item{X_df}{A tsibble or a data frame with future exogenous variables.}
+
 \item{level}{The confidence levels (0-100) for the prediction intervals.}
 
 \item{finetune_steps}{Number of steps used to finetune TimeGPT in the new data.}