diff --git a/DESCRIPTION b/DESCRIPTION index 3e513ce..4ff112e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: nixtlar Title: A Software Development Kit for 'Nixtla''s 'TimeGPT' -Version: 0.5.4 +Version: 0.6.0 Authors@R: c( person("Mariana", "Menchero", email = "mariana@nixtla.io", role = c("aut", "cre"), comment = "First author and maintainer"), person("Nixtla", role = "cph", comment = "Copyright held by 'Nixtla'") diff --git a/R/generate_output_dates.R b/R/generate_output_dates.R index 78e02ec..9b38675 100644 --- a/R/generate_output_dates.R +++ b/R/generate_output_dates.R @@ -14,28 +14,46 @@ #' dates_df <- .generate_output_dates(df_info, freq, h) #' } #' -.generate_output_dates <- function(df_info, freq, h) { - new_dates <- lapply(1:nrow(df_info), function(i) { +.generate_output_dates <- function(df_info, freq, h){ + + new_dates <- vector("list", nrow(df_info)) + r_freq <- .r_frequency(freq) + + for(i in 1:nrow(df_info)){ start_date <- df_info$dates[i] - r_freq <- .r_frequency(freq) - if(freq == "QE") { - # End of quarter dates are: "YYY-03-31", "YYYY-06-30", "YYYY-09-30" and "YYYY-12-31". - dt <- seq(from = start_date, by = "quarter", length.out = h+1) + if(freq %in% c("QE", "Q")){ + dt <- seq(from = start_date, by = r_freq, length.out = h+1) month <- lubridate::month(start_date) + dt <- seq(from = start_date, by = "quarter", length.out = h+1) + + # Calendar adjustments if (month %in% c(3, 12)) { - dt <- ifelse(lubridate::month(dt) %in% c(7, 10), dt - lubridate::days(1), dt) + for (j in 1:length(dt)) { + mt <- lubridate::month(dt[j]) + if (mt %in% c(7, 10)) { + dt[j] <- dt[j] - lubridate::days(1) + } + } } else { - dt <- ifelse(lubridate::month(dt) %in% c(3, 12), dt + lubridate::days(1), dt) + # month %in% c(6, 9) + for (j in 1:length(dt)) { + mt <- lubridate::month(dt[j]) + if (mt %in% c(3, 12)) { + dt[j] <- dt[j] + lubridate::days(1) + } + } } - } else if(freq == "ME") { - dt <- seq(from = start_date + lubridate::days(1), by = r_freq, length.out = h+1) - lubridate::days(1) - } else { - dt <- seq(from = start_date, by = r_freq, length.out = h+1) + + }else if(freq %in% c("ME", "M")){ + start_date <- start_date+lubridate::days(1) + dt <- seq(from = start_date, by = r_freq, length.out = h+1)-lubridate::days(1) + }else{ + dt <- seq(df_info$dates[i], by = r_freq, length.out = h+1) } - dt[2:length(dt)] - }) + new_dates[[i]] <- dt[2:length(dt)] + } dates_df <- data.frame(lapply(new_dates, as.POSIXct)) diff --git a/R/infer_frequency.R b/R/infer_frequency.R index bf1ed5a..1372dc5 100644 --- a/R/infer_frequency.R +++ b/R/infer_frequency.R @@ -16,11 +16,23 @@ infer_frequency <- function(df, freq){ return(freq) } - num_chars <- nchar(as.character(df$ds[1])) + if(length(unique(df$ds)) > 1){ # this is done to avoid the vanishing dates issue + dt <- sample(df$ds, 2) + }else{ + dt <- df$ds[1] + } + + # Vanishing dates issue: Dates that correspond to midnight only show YYYY-MM-DD, excluding 00:00:00 + + num_chars <- max(nchar(as.character(dt))) if(num_chars <= 10){ # assumes dates in format YYYY-MM-DD - dates <- lubridate::ymd(sort(unique(df$ds))) + if(inherits(df$ds, "character")){ + dates <- lubridate::ymd(sort(unique(df$ds))) + }else{ + dates <- sort(unique(df$ds)) + } dates_diff <- diff(dates) dates_table <- table(dates_diff) mode <- as.numeric(names(which.max(dates_table))) @@ -28,9 +40,9 @@ infer_frequency <- function(df, freq){ freq_list = list( list(alias = "Y", value = c(365,366)), list(alias = "Q", value = c(91,92)), - list(alias = "MS", value = c(30,31)), + list(alias = "M", value = c(30,31)), list(alias = "W", value = c(7)), - list(alias = "D", value = c(1)) + list(alias = "D", value = c(24,1)) ) for(item in freq_list){ @@ -45,7 +57,11 @@ infer_frequency <- function(df, freq){ }else{ # assumes dates in format YYYY-MM-DD hh:mm:ss - dates <- lubridate::ymd_hms(sort(unique(df$ds))) + if(inherits(df$ds, "character")){ + dates <- lubridate::ymd_hms(sort(unique(df$ds))) + }else{ + dates <- sort(unique(df$ds)) + } dates_diff <- diff(dates) dates_table <- table(dates_diff) mode <- as.numeric(names(which.max(dates_table))) diff --git a/R/nixtla_client_forecast.R b/R/nixtla_client_forecast.R index 2e37322..80c4335 100644 --- a/R/nixtla_client_forecast.R +++ b/R/nixtla_client_forecast.R @@ -222,18 +222,34 @@ nixtla_client_forecast <- function(df, h=8, freq=NULL, id_col="unique_id", time_ } # Add unique ids and dates to forecast ---- - nch <- nchar(df_info$last_ds[1]) - if(nch <= 10){ - df_info$dates <- lubridate::ymd(df_info$last_ds) + if(inherits(df_info$last_ds, "character")){ + if(length(df_info$last_ds) > 1){ + dt <- sample(df_info$last_ds, 2) + }else{ + dt <- df_info$last_ds[1] + } + nch <- max(nchar(as.character(dt))) + if(nch <= 10){ + df_info$dates <- lubridate::ymd(df_info$last_ds) + }else{ + df_info$dates <- lubridate::ymd_hms(df_info$last_ds) + } }else{ - df_info$dates <- lubridate::ymd_hms(df_info$last_ds) + # assumes df_info$last_ds is already a date-object + df_info$dates <- df_info$last_ds } dates_df <- .generate_output_dates(df_info, freq, h) dates_long_df <- dates_df |> - tidyr::pivot_longer(cols = everything(), names_to = "unique_id", values_to = "ds") |> - dplyr::arrange(.data$unique_id) + tidyr::pivot_longer(cols = everything(), names_to = "unique_id", values_to = "ds") + + if(inherits(df$unique_id, "integer")){ + dates_long_df$unique_id <- as.numeric(dates_long_df$unique_id) + } + + dates_long_df <- dates_long_df |> + dplyr::arrange(.data$unique_id) forecast <- cbind(dates_long_df, fc) diff --git a/README.Rmd b/README.Rmd index 39f0d27..759a4e3 100644 --- a/README.Rmd +++ b/README.Rmd @@ -41,7 +41,7 @@ We are excited to announce the release of `nixtlar` version 0.6.0, which integra - **Date Formats**: For efficiency, `nixtlar` now strictly requires dates to be in the format `YYYY-MM-DD` or `YYYY-MM-DD hh:mm:ss`, either as character strings or date-time objects. For more details, please refer to our [Get Started](https://nixtla.github.io/nixtlar/articles/get-started.html) guide and [Data Requirements](https://nixtla.github.io/nixtlar/articles/data-requirements.html) vignette. -- **Default ID Column**: In alignment with the Python SDK, `nixtlar` now defaults the `id_col` to `unique_id`. This means you no longer need to specify this column if it is already named `unique_id`. If your dataset contains only one series, simply set `id_col=NULL`. +- **Default ID Column**: In alignment with the Python SDK, `nixtlar` now defaults the `id_col` to `unique_id`. This means you no longer need to specify this column if it is already named `unique_id`. If your dataset contains only one series, simply set `id_col=NULL`. The `id_col` only accepts characters or integers. These changes leverage the capabilities of `TimeGPT`'s new API and align `nixtlar` more closely with the Python SDK, ensuring a better user experience. diff --git a/vignettes/data-requirements.Rmd b/vignettes/data-requirements.Rmd index 57e3da9..7453b0e 100644 --- a/vignettes/data-requirements.Rmd +++ b/vignettes/data-requirements.Rmd @@ -60,6 +60,8 @@ head(df) str(df) ``` +The `id_col` only accepts characters or integers. + ## 3. Exogenous Variables When using exogenous variables, `nixtlar` differentiates between historical and future exogenous variables: