From ad40a3b3ba20b17679b8b52de37dad6be3b071f3 Mon Sep 17 00:00:00 2001
From: sayanc <sayanc@zillowgroup.com>
Date: Tue, 8 Jun 2021 00:39:42 -0700
Subject: [PATCH 1/2] window size detection logic updated

---
 luminaire/exploration/data_exploration.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/luminaire/exploration/data_exploration.py b/luminaire/exploration/data_exploration.py
index 09d9f41..bc14288 100644
--- a/luminaire/exploration/data_exploration.py
+++ b/luminaire/exploration/data_exploration.py
@@ -1032,6 +1032,7 @@ def stream_profile(self, df, impute_only=False, **kwargs):
         import datetime
         import numpy as np
         import pandas as pd
+        from scipy import stats
 
         try:
             processed_df, freq = self._prepare(df, impute_only=impute_only, streaming=True, **kwargs)
@@ -1058,7 +1059,7 @@ def stream_profile(self, df, impute_only=False, **kwargs):
             if not self.window_length:
                 window_length_list = []
 
-                for i in range(20):
+                for i in range(100):
                     rand_date = sample(idx_date_list, 1)[0]
                     rand_start_idx = pd.Timestamp(datetime.datetime.combine(rand_date, training_start_time))
                     if rand_date in idx_date_list[:int(len(idx_date_list) / 2)]:
@@ -1070,7 +1071,18 @@ def stream_profile(self, df, impute_only=False, **kwargs):
                         else self.window_length
                     window_length_list.append(window_length_i)
 
-                window_length = int(np.median(window_length_list))
+                window_length_list = np.array(window_length_list)
+                if np.all(window_length_list == min(window_length_list)):
+                    window_length = window_length_list[0]
+                else:
+                    bin_count = max(1, int((max(window_length_list) - min(window_length_list)) / 12))
+                    bins = np.linspace(min(window_length_list) - 1, max(window_length_list) + 1, bin_count)
+                    if len(bins) == 1:
+                        window_length = int(stats.mode(window_length_list).mode[0])
+                    else:
+                        digitized = np.digitize(window_length_list, bins)
+                        arg_mode = np.argmax([len(window_length_list[digitized == i]) for i in range(1, len(bins))]) + 1
+                        window_length = int(stats.mode(window_length_list[digitized == arg_mode]).mode[0])
 
                 if window_length < self.min_window_length:
                     raise ValueError('Training window too small')

From 02c1b7912b90d6dccbe855fcf2991ab4abf9ca37 Mon Sep 17 00:00:00 2001
From: sayanc <sayanc@zillowgroup.com>
Date: Wed, 9 Jun 2021 12:44:14 -0700
Subject: [PATCH 2/2] comments added for the new window size detection logic.
 version updated for release

---
 luminaire/exploration/data_exploration.py | 5 +++++
 setup.py                                  | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/luminaire/exploration/data_exploration.py b/luminaire/exploration/data_exploration.py
index bc14288..3b4192d 100644
--- a/luminaire/exploration/data_exploration.py
+++ b/luminaire/exploration/data_exploration.py
@@ -1059,6 +1059,8 @@ def stream_profile(self, df, impute_only=False, **kwargs):
             if not self.window_length:
                 window_length_list = []
 
+                # If the window size is not specified, the following logic draws several random segments of the
+                # time series and computes an optimal window size for each, yielding a list of candidate sizes.
                 for i in range(100):
                     rand_date = sample(idx_date_list, 1)[0]
                     rand_start_idx = pd.Timestamp(datetime.datetime.combine(rand_date, training_start_time))
@@ -1072,6 +1074,9 @@ def stream_profile(self, df, impute_only=False, **kwargs):
                     window_length_list.append(window_length_i)
 
                 window_length_list = np.array(window_length_list)
+
+                # From the list of optimal window sizes, if all values are identical, we take that value as the
+                # window size. Otherwise, we take the most frequent value within the most populated bin of the list.
                 if np.all(window_length_list == min(window_length_list)):
                     window_length = window_length_list[0]
                 else:
diff --git a/setup.py b/setup.py
index b3cf700..100a4bb 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@
 
 setup(
     name='luminaire',
-    version='0.2.0',
+    version='0.2.1',
 
     license='Apache License 2.0',