Added support for exogenous variables and minor improvements to core functions.
Nixtla · Dec 13, 2023 · 0a09e9c · 0a09e9c
1 parent 85d8c84
commit 0a09e9c
Show file tree

Hide file tree

Showing 9 changed files with 138 additions and 45 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(.validate_exogenous)
 export(date_conversion)
 export(infer_frequency)
 export(nixtla_set_token)

diff --git a/R/timegpt_anomaly_detection.R b/R/timegpt_anomaly_detection.R
@@ -15,12 +15,17 @@
 timegpt_anomaly_detection <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_col="y", level=c(99), clean_ex_first=TRUE, model="timegpt-1"){
 
   # Prepare data ----
-  url_anomaly <- "https://dashboard.nixtla.io/api/timegpt_multi_series_anomalies"
+  names(df)[which(names(df) == time_col)] <- "ds"
+  names(df)[which(names(df) == target_col)] <- "y"
+
   if(is.null(id_col)){
     # create unique_id for single series
     df <- df |>
-      dplyr::mutate(unique_id = "id") |>
+      dplyr::mutate(unique_id = "ts_0") |>
       dplyr::select(c("unique_id", tidyselect::everything()))
+  }else{
+    # id_col is not NULL
+    names(df)[which(names(df) == id_col)] <- "unique_id"
   }
 
   data <- .timegpt_data_prep(df, freq, id_col, time_col, target_col)
@@ -34,11 +39,10 @@ timegpt_anomaly_detection <- function(df, freq=NULL, id_col=NULL, time_col="ds",
     clean_ex_first = clean_ex_first
   )
 
-  names(df)[which(names(df) == time_col)] <- "ds"
-  names(df)[which(names(df) == target_col)] <- "y"
-  if(any(!(names(df) %in% c("unique_id", "ds", "y")))){
-    exogenous <- df |>
-      dplyr::select(-y)
+  if(!any(names(df) %in% c("unique_id", "ds", "y"))){
+    # input includes exogenous variables
+    exogenous <-  df |>
+      dplyr::select(-c(.data$y))
 
     x <- list(
       columns = names(exogenous),
@@ -55,6 +59,7 @@ timegpt_anomaly_detection <- function(df, freq=NULL, id_col=NULL, time_col="ds",
   timegpt_data[["level"]] <- level
 
   # Make request ----
+  url_anomaly <- "https://dashboard.nixtla.io/api/timegpt_multi_series_anomalies"
   resp_anomaly <- httr2::request(url_anomaly) |>
     httr2::req_headers(
       "accept" = "application/json",
@@ -104,7 +109,7 @@ timegpt_anomaly_detection <- function(df, freq=NULL, id_col=NULL, time_col="ds",
   }else{
     # remove unique_id column
     res <- res |>
-      dplyr::select(-unique_id)
+      dplyr::select(-c(.data$unique_id))
   }
 
   return(res)

diff --git a/R/timegpt_cross_validation.R b/R/timegpt_cross_validation.R
@@ -20,11 +20,16 @@
 timegpt_cross_validation <- function(df, h=8, freq=NULL, id_col=NULL, time_col="ds", target_col="y", X_df=NULL, level=NULL, n_windows=1, step_size=NULL, finetune_steps=0, clean_ex_first=TRUE, model="timegpt-1"){
 
   # Prepare data ----
-  url_cv <- "https://dashboard.nixtla.io/api/timegpt_multi_series_cross_validation"
+  names(df)[which(names(df) == time_col)] <- "ds"
+  names(df)[which(names(df) == target_col)] <- "y"
+
   if(is.null(id_col)){
     df <- df |>
-      dplyr::mutate(unique_id = "id") |>
+      dplyr::mutate(unique_id = "ts_0") |>
       dplyr::select(c("unique_id", tidyselect::everything()))
+  }else{
+    # id_col is not NULL
+    names(df)[which(names(df) == id_col)] <- "unique_id"
   }
 
   data <- .timegpt_data_prep(df, freq, id_col, time_col, target_col)
@@ -48,11 +53,20 @@ timegpt_cross_validation <- function(df, h=8, freq=NULL, id_col=NULL, time_col="
 
   if(!is.null(X_df)){
     names(X_df)[which(names(X_df) == time_col)] <- "ds"
-    names(X_df)[which(names(X_df) == target_col)] <- "y"
-    if(!is.null(id_col)){
+    if(is.null(id_col)){
+      X_df <- X_df |>
+        dplyr::mutate(unique_id = "ts_0") |>
+        dplyr::select(c("unique_id", tidyselect::everything()))
+    }else{
       names(X_df)[which(names(X_df) == id_col)] <- "unique_id"
     }
 
+    # Validation checks for exogenous variables
+    status <- .validate_exogenous(df, h, X_df)
+    if(!status$validation){
+      stop(print(status$message))
+    }
+
     exogenous <-  df |>
       dplyr::select(-y)
 
@@ -72,6 +86,7 @@ timegpt_cross_validation <- function(df, h=8, freq=NULL, id_col=NULL, time_col="
   }
 
   # Make request ----
+  url_cv <- "https://dashboard.nixtla.io/api/timegpt_multi_series_cross_validation"
   resp_cv <- httr2::request(url_cv) |>
     httr2::req_headers(
       "accept" = "application/json",
@@ -133,7 +148,7 @@ timegpt_cross_validation <- function(df, h=8, freq=NULL, id_col=NULL, time_col="
   }else{
     # remove unique_id column
     res <- res |>
-      dplyr::select(-unique_id)
+      dplyr::select(-c(.data$unique_id))
   }
 
   return(res)

diff --git a/R/timegpt_data_prep.R b/R/timegpt_data_prep.R
@@ -1,11 +1,11 @@
 #' Prepares data for TimeGPT's API
-#' This is a private function of the package
+#' This is a private function of nixtlar
 #'
 #' @param df A tsibble or a data frame with time series data.
 #' @param freq Frequency of the data.
-#' @param id_col Column that identifies each series.
-#' @param time_col Column that identifies each timestep.
-#' @param target_col Column that contains the target variable.
+#' @param id_col Column that identifies each series. Should be named unique_id.
+#' @param time_col Column that identifies each timestep. Should be named ds.
+#' @param target_col Column that contains the target variable. Should be named y.
 #'
 #' @return A list with the given or inferred frequency, the prepared data, and the original data frame renamed.
 #'
@@ -15,13 +15,6 @@
     stop("Only tsibbles or data frames are allowed.")
   }
 
-  # Rename columns
-  names(df)[which(names(df) == time_col)] <- "ds"
-  names(df)[which(names(df) == target_col)] <- "y"
-  if(!is.null(id_col)){
-    names(df)[which(names(df) == id_col)] <- "unique_id"
-  }
-
   # If df is a tsibble, convert dates to strings and infer frequency if necessary
   if(tsibble::is_tsibble(df)){
     res <- date_conversion(df)
@@ -35,10 +28,11 @@
   }
 
   # Prepare data
-  df <- df[,c("unique_id", "ds", "y")]
+  filtered_df <- df[,c("unique_id", "ds", "y")]
+
   y <- list(
-    columns = names(df),
-    data = lapply(1:nrow(df), function(i) as.list(df[i,]))
+    columns = names(filtered_df),
+    data = lapply(1:nrow(filtered_df), function(i) as.list(filtered_df[i,]))
     )
 
   res <- list(freq = freq,

diff --git a/R/timegpt_forecast.R b/R/timegpt_forecast.R
@@ -19,13 +19,17 @@
 timegpt_forecast <- function(df, h=8, freq=NULL, id_col=NULL, time_col="ds", target_col="y", X_df=NULL, level=NULL, finetune_steps=0, clean_ex_first=TRUE, add_history=FALSE, model="timegpt-1"){
 
   # Prepare data ----
-  url <- "https://dashboard.nixtla.io/api/timegpt_multi_series"
+  names(df)[which(names(df) == time_col)] <- "ds"
+  names(df)[which(names(df) == target_col)] <- "y"
 
   if(is.null(id_col)){
     # create unique_id for single series
     df <- df |>
-      dplyr::mutate(unique_id = "id") |>
+      dplyr::mutate(unique_id = "ts_0") |>
       dplyr::select(c("unique_id", tidyselect::everything()))
+  }else{
+    # id_col is not NULL
+    names(df)[which(names(df) == id_col)] <- "unique_id"
   }
 
   data <- .timegpt_data_prep(df, freq, id_col, time_col, target_col)
@@ -43,10 +47,20 @@ timegpt_forecast <- function(df, h=8, freq=NULL, id_col=NULL, time_col="ds", tar
 
   if(!is.null(X_df)){
     names(X_df)[which(names(X_df) == time_col)] <- "ds"
-    if(!is.null(id_col)){
+    if(is.null(id_col)){
+      X_df <- X_df |>
+        dplyr::mutate(unique_id = "ts_0") |>
+        dplyr::select(c("unique_id", tidyselect::everything()))
+    }else{
       names(X_df)[which(names(X_df) == id_col)] <- "unique_id"
     }
 
+    # Validation checks for exogenous variables
+    status <- .validate_exogenous(df, h, X_df)
+    if(!status$validation){
+      stop(print(status$message))
+    }
+
     exogenous <-  df |>
       dplyr::select(-y)
 
@@ -66,6 +80,7 @@ timegpt_forecast <- function(df, h=8, freq=NULL, id_col=NULL, time_col="ds", tar
   }
 
   # Make request ----
+  url <- "https://dashboard.nixtla.io/api/timegpt_multi_series"
   resp <- httr2::request(url) |>
     httr2::req_headers(
       "accept" = "application/json",
@@ -118,7 +133,7 @@ timegpt_forecast <- function(df, h=8, freq=NULL, id_col=NULL, time_col="ds", tar
   }else{
     # remove unique_id column
     fcst <- fcst |>
-      dplyr::select(-unique_id)
+      dplyr::select(-c(.data$unique_id))
   }
 
   # Generate fitted values ----

diff --git a/R/timegpt_historic.R b/R/timegpt_historic.R
@@ -15,13 +15,17 @@
 timegpt_historic <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_col="y", level=NULL, finetune_steps=0, clean_ex_first=TRUE){
 
   # Prepare data ----
-  url_historic <- "https://dashboard.nixtla.io/api/timegpt_multi_series_historic"
+  names(df)[which(names(df) == time_col)] <- "ds"
+  names(df)[which(names(df) == target_col)] <- "y"
 
   if(is.null(id_col)){
     # create unique_id for single series
     df <- df |>
-      dplyr::mutate(unique_id = "id") |>
+      dplyr::mutate(unique_id = "ts_0") |>
       dplyr::select(c("unique_id", tidyselect::everything()))
+  }else{
+    # id_col is not NULL
+    names(df)[which(names(df) == id_col)] <- "unique_id"
   }
 
   data <- .timegpt_data_prep(df, freq, id_col, time_col, target_col)
@@ -35,12 +39,10 @@ timegpt_historic <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_c
     clean_ex_first = clean_ex_first
   )
 
-  names(df)[which(names(df) == time_col)] <- "ds"
-  names(df)[which(names(df) == target_col)] <- "y"
-
-  if(any(!(names(df) %in% c("unique_id", "ds", "y")))){
-    exogenous <- df |>
-      dplyr::select(-y)
+  if(!any(names(df) %in% c("unique_id", "ds", "y"))){
+    # input includes exogenous variables
+    exogenous <-  df |>
+      dplyr::select(-c(.data$y))
 
     x <- list(
       columns = names(exogenous),
@@ -56,6 +58,7 @@ timegpt_historic <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_c
   }
 
   # Make request ----
+  url_historic <- "https://dashboard.nixtla.io/api/timegpt_multi_series_historic"
   resp_hist <- httr2::request(url_historic) |>
     httr2::req_headers(
       "accept" = "application/json",
@@ -105,7 +108,7 @@ timegpt_historic <- function(df, freq=NULL, id_col=NULL, time_col="ds", target_c
   }else{
     # remove unique_id column
     fitted <- fitted |>
-      dplyr::select(-unique_id)
+      dplyr::select(-c(.data$unique_id))
   }
 
   return(fitted)

diff --git a/R/validate_exogenous.R b/R/validate_exogenous.R
@@ -0,0 +1,37 @@
+#' Validate exogenous variables (if applicable)
+#' This is a private function of nixtlar
+#'
+#' @param df A tsibble or a data frame with time series data.
+#' @param h Forecast horizon; each series in X_df must have exactly h rows.
+#' @param X_df A tsibble or a data frame with future exogenous variables.
+#'
+#' @return A list with `validation` (TRUE/FALSE) and `message` (NULL when the
+#'   validation passes, otherwise the text of the first failed check).
+#' @export
+#'
+.validate_exogenous <- function(df, h, X_df){
+
+  status <- list(validation = TRUE, message = NULL)
+
+  # Check if df and X_df contain the same exogenous variables
+  vals_df <- setdiff(names(df), c("unique_id", "ds", "y"))
+  vals_X_df <- setdiff(names(X_df), c("unique_id", "ds"))
+
+  if(!setequal(vals_df, vals_X_df)){
+    status$validation <- FALSE
+    status$message <- "df and X_df must contain the same exogenous variables."
+    # Return now so the horizon check below cannot overwrite this message
+    return(status)
+  }
+
+  # Check if the future values of the exogenous variables cover the forecast
+  # horizon: every series present in X_df must contribute exactly h rows
+  rows_per_series <- table(as.character(X_df$unique_id), useNA = "ifany")
+
+  if(any(rows_per_series != h)){
+    status$validation <- FALSE
+    status$message <- "The future values of the exogenous variables must cover the forecast horizon"
+  }
+
+  return(status)
+}
diff --git a/man/dot-timegpt_data_prep.Rd b/man/dot-timegpt_data_prep.Rd
diff --git a/man/dot-validate_exogenous.Rd b/man/dot-validate_exogenous.Rd