Merge pull request #27 from Nixtla/v2-issues

fix: release v0.6.0 issues
Nixtla · Oct 9, 2024 · 9c6522c · 9c6522c
2 parents 3cb04e4 + 4c54e8d
commit 9c6522c
Show file tree

Hide file tree

Showing 6 changed files with 79 additions and 27 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: nixtlar
 Title: A Software Development Kit for 'Nixtla''s 'TimeGPT'
-Version: 0.5.4
+Version: 0.6.0
 Authors@R: c(
     person("Mariana", "Menchero", email = "[email protected]", role = c("aut", "cre"), comment = "First author and maintainer"),
     person("Nixtla", role = "cph", comment = "Copyright held by 'Nixtla'")

diff --git a/R/generate_output_dates.R b/R/generate_output_dates.R
@@ -14,28 +14,46 @@
 #'   dates_df <- .generate_output_dates(df_info, freq, h)
 #' }
 #'
-.generate_output_dates <- function(df_info, freq, h) {
-  new_dates <- lapply(1:nrow(df_info), function(i) {
+.generate_output_dates <- function(df_info, freq, h){
+
+  new_dates <- vector("list", nrow(df_info))
+  r_freq <- .r_frequency(freq)
+
+  for(i in 1:nrow(df_info)){
     start_date <- df_info$dates[i]
-    r_freq <- .r_frequency(freq)
 
-    if(freq == "QE") {
-      # End of quarter dates are: "YYY-03-31", "YYYY-06-30", "YYYY-09-30" and "YYYY-12-31".
-      dt <- seq(from = start_date, by = "quarter", length.out = h+1)
+    if(freq %in% c("QE", "Q")){
+      dt <- seq(from = start_date, by = r_freq, length.out = h+1)
       month <- lubridate::month(start_date)
+      dt <- seq(from = start_date, by = "quarter", length.out = h+1)
+
+      # Calendar adjustments
       if (month %in% c(3, 12)) {
-        dt <- ifelse(lubridate::month(dt) %in% c(7, 10), dt - lubridate::days(1), dt)
+        for (j in 1:length(dt)) {
+          mt <- lubridate::month(dt[j])
+          if (mt %in% c(7, 10)) {
+            dt[j] <- dt[j] - lubridate::days(1)
+          }
+        }
       } else {
-        dt <- ifelse(lubridate::month(dt) %in% c(3, 12), dt + lubridate::days(1), dt)
+        # month %in% c(6, 9)
+        for (j in 1:length(dt)) {
+          mt <- lubridate::month(dt[j])
+          if (mt %in% c(3, 12)) {
+            dt[j] <- dt[j] + lubridate::days(1)
+          }
+        }
       }
-    } else if(freq == "ME") {
-      dt <- seq(from = start_date + lubridate::days(1), by = r_freq, length.out = h+1) - lubridate::days(1)
-    } else {
-      dt <- seq(from = start_date, by = r_freq, length.out = h+1)
+
+    }else if(freq %in% c("ME", "M")){
+      start_date <- start_date+lubridate::days(1)
+      dt <- seq(from = start_date, by = r_freq, length.out = h+1)-lubridate::days(1)
+    }else{
+      dt <- seq(df_info$dates[i], by = r_freq, length.out = h+1)
     }
 
-    dt[2:length(dt)]
-  })
+    new_dates[[i]] <- dt[2:length(dt)]
+  }
 
   dates_df <- data.frame(lapply(new_dates, as.POSIXct))
 

diff --git a/R/infer_frequency.R b/R/infer_frequency.R
@@ -16,21 +16,33 @@ infer_frequency <- function(df, freq){
     return(freq)
   }
 
-  num_chars <- nchar(as.character(df$ds[1]))
+  if(length(unique(df$ds)) > 1){ # this is done to avoid the vanishing dates issue
+    dt <- sample(df$ds, 2)
+  }else{
+    dt <- df$ds[1]
+  }
+
+  # Vanishing dates issue: Dates that correspond to midnight only show YYYY-MM-DD, excluding 00:00:00
+
+  num_chars <- max(nchar(as.character(dt)))
 
   if(num_chars <= 10){
     # assumes dates in format YYYY-MM-DD
-    dates <- lubridate::ymd(sort(unique(df$ds)))
+    if(inherits(df$ds, "character")){
+      dates <- lubridate::ymd(sort(unique(df$ds)))
+    }else{
+      dates <- sort(unique(df$ds))
+    }
     dates_diff <- diff(dates)
     dates_table <- table(dates_diff)
     mode <- as.numeric(names(which.max(dates_table)))
 
     freq_list = list(
       list(alias = "Y", value = c(365,366)),
       list(alias = "Q", value = c(91,92)),
-      list(alias = "MS", value = c(30,31)),
+      list(alias = "M", value = c(30,31)),
       list(alias = "W", value = c(7)),
-      list(alias = "D", value = c(1))
+      list(alias = "D", value = c(24,1))
     )
 
     for(item in freq_list){
@@ -45,7 +57,11 @@ infer_frequency <- function(df, freq){
 
   }else{
     # assumes dates in format YYYY-MM-DD hh:mm:ss
-    dates <- lubridate::ymd_hms(sort(unique(df$ds)))
+    if(inherits(df$ds, "character")){
+      dates <- lubridate::ymd_hms(sort(unique(df$ds)))
+    }else{
+      dates <- sort(unique(df$ds))
+    }
     dates_diff <- diff(dates)
     dates_table <- table(dates_diff)
     mode <- as.numeric(names(which.max(dates_table)))

diff --git a/R/nixtla_client_forecast.R b/R/nixtla_client_forecast.R
@@ -222,18 +222,34 @@ nixtla_client_forecast <- function(df, h=8, freq=NULL, id_col="unique_id", time_
   }
 
   # Add unique ids and dates to forecast ----
-  nch <- nchar(df_info$last_ds[1])
-  if(nch <= 10){
-    df_info$dates <- lubridate::ymd(df_info$last_ds)
+  if(inherits(df_info$last_ds, "character")){
+    if(length(df_info$last_ds) > 1){
+      dt <- sample(df_info$last_ds, 2)
+    }else{
+      dt <- df_info$last_ds[1]
+    }
+    nch <- max(nchar(as.character(dt)))
+    if(nch <= 10){
+      df_info$dates <- lubridate::ymd(df_info$last_ds)
+    }else{
+      df_info$dates <- lubridate::ymd_hms(df_info$last_ds)
+    }
   }else{
-    df_info$dates <- lubridate::ymd_hms(df_info$last_ds)
+    # assumes df_info$last_ds is already a date-object
+    df_info$dates <- df_info$last_ds
   }
 
   dates_df <- .generate_output_dates(df_info, freq, h)
 
   dates_long_df <- dates_df |>
-    tidyr::pivot_longer(cols = everything(), names_to = "unique_id", values_to = "ds") |>
-    dplyr::arrange(.data$unique_id)
+    tidyr::pivot_longer(cols = everything(), names_to = "unique_id", values_to = "ds")
+
+  if(inherits(df$unique_id, "integer")){
+    dates_long_df$unique_id <- as.numeric(dates_long_df$unique_id)
+  }
+
+  dates_long_df <- dates_long_df |>
+      dplyr::arrange(.data$unique_id)
 
   forecast <- cbind(dates_long_df, fc)
 

diff --git a/README.Rmd b/README.Rmd
@@ -41,7 +41,7 @@ We are excited to announce the release of `nixtlar` version 0.6.0, which integra
 
 - **Date Formats**: For efficiency, `nixtlar` now strictly requires dates to be in the format `YYYY-MM-DD` or `YYYY-MM-DD hh:mm:ss`, either as character strings or date-time objects. For more details, please refer to our [Get Started](https://nixtla.github.io/nixtlar/articles/get-started.html) guide and [Data Requirements](https://nixtla.github.io/nixtlar/articles/data-requirements.html) vignette. 
 
-- **Default ID Column**: In alignment with the Python SDK, `nixtlar` now defaults the `id_col` to `unique_id`. This means you no longer need to specify this column if it is already named `unique_id`. If your dataset contains only one series, simply set `id_col=NULL`.
+- **Default ID Column**: In alignment with the Python SDK, `id_col` now defaults to `unique_id`. This means you no longer need to specify this column if it is already named `unique_id`. If your dataset contains only one series, simply set `id_col=NULL`. The column specified by `id_col` must contain character or integer values.
 
 These changes leverage the capabilities of `TimeGPT`'s new API and align `nixtlar` more closely with the Python SDK, ensuring a better user experience. 
 

diff --git a/vignettes/data-requirements.Rmd b/vignettes/data-requirements.Rmd
@@ -60,6 +60,8 @@ head(df)
 str(df)
 ```
 
+The column specified by `id_col` must contain character or integer values.
+
 ## 3. Exogenous Variables
 
 When using exogenous variables, `nixtlar` differentiates between historical and future exogenous variables: