From ad40a3b3ba20b17679b8b52de37dad6be3b071f3 Mon Sep 17 00:00:00 2001 From: sayanc Date: Tue, 8 Jun 2021 00:39:42 -0700 Subject: [PATCH 1/2] window size detection logic updated --- luminaire/exploration/data_exploration.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/luminaire/exploration/data_exploration.py b/luminaire/exploration/data_exploration.py index 09d9f41..bc14288 100644 --- a/luminaire/exploration/data_exploration.py +++ b/luminaire/exploration/data_exploration.py @@ -1032,6 +1032,7 @@ def stream_profile(self, df, impute_only=False, **kwargs): import datetime import numpy as np import pandas as pd + from scipy import stats try: processed_df, freq = self._prepare(df, impute_only=impute_only, streaming=True, **kwargs) @@ -1058,7 +1059,7 @@ def stream_profile(self, df, impute_only=False, **kwargs): if not self.window_length: window_length_list = [] - for i in range(20): + for i in range(100): rand_date = sample(idx_date_list, 1)[0] rand_start_idx = pd.Timestamp(datetime.datetime.combine(rand_date, training_start_time)) if rand_date in idx_date_list[:int(len(idx_date_list) / 2)]: @@ -1070,7 +1071,18 @@ def stream_profile(self, df, impute_only=False, **kwargs): else self.window_length window_length_list.append(window_length_i) - window_length = int(np.median(window_length_list)) + window_length_list = np.array(window_length_list) + if np.all(window_length_list == min(window_length_list)): + window_length = window_length_list[0] + else: + bin_count = max(1, int((max(window_length_list) - min(window_length_list)) / 12)) + bins = np.linspace(min(window_length_list) - 1, max(window_length_list) + 1, bin_count) + if len(bins) == 1: + window_length = int(stats.mode(window_length_list).mode[0]) + else: + digitized = np.digitize(window_length_list, bins) + arg_mode = np.argmax([len(window_length_list[digitized == i]) for i in range(1, len(bins))]) + 1 + window_length = int(stats.mode(window_length_list[digitized == 
arg_mode]).mode[0]) if window_length < self.min_window_length: raise ValueError('Training window too small') From 02c1b7912b90d6dccbe855fcf2991ab4abf9ca37 Mon Sep 17 00:00:00 2001 From: sayanc Date: Wed, 9 Jun 2021 12:44:14 -0700 Subject: [PATCH 2/2] comments added for the new window size detection logic. version updated for release --- luminaire/exploration/data_exploration.py | 5 +++++ setup.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/luminaire/exploration/data_exploration.py b/luminaire/exploration/data_exploration.py index bc14288..3b4192d 100644 --- a/luminaire/exploration/data_exploration.py +++ b/luminaire/exploration/data_exploration.py @@ -1059,6 +1059,8 @@ def stream_profile(self, df, impute_only=False, **kwargs): if not self.window_length: window_length_list = [] + # If the window size is not specified, the following logic samples several random segments of the + # time series and obtains an optimal window size for each, giving a list of optimal window sizes for i in range(100): rand_date = sample(idx_date_list, 1)[0] rand_start_idx = pd.Timestamp(datetime.datetime.combine(rand_date, training_start_time)) @@ -1072,6 +1074,9 @@ def stream_profile(self, df, impute_only=False, **kwargs): window_length_list.append(window_length_i) window_length_list = np.array(window_length_list) + + # From the list of optimal window sizes, if all the values are identical, we take that value as the window + # size. Otherwise, we bin the values and take the most frequent window size within the most populated bin. if np.all(window_length_list == min(window_length_list)): window_length = window_length_list[0] else: diff --git a/setup.py b/setup.py index b3cf700..100a4bb 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ setup( name='luminaire', - version='0.2.0', + version='0.2.1', license='Apache License 2.0',