fixes #57 doc updates
sayanchk committed Aug 24, 2020
1 parent 0c203c2 commit bcbbf9d
Showing 8 changed files with 66 additions and 59 deletions.
2 changes: 2 additions & 0 deletions docs/tutorial/optimization.rst
@@ -67,4 +67,6 @@ This trained model is now ready to be used for scoring future data points.
>>> trained_model.score(2000, '2020-06-08')
{'Success': True, 'IsLogTransformed': 1, 'LogTransformedAdjustedActual': 7.601402334583733, 'LogTransformedPrediction': 7.529710533463001, 'LogTransformedStdErr': 0.06217883425408564, 'LogTransformedCILower': 7.422390543346913, 'LogTransformedCIUpper': 7.62662106869458, 'AdjustedActual': 2000.000000000015, 'Prediction': 1861.566274906425, 'StdErr': 110.9167321105633, 'CILower': 1672.028177505716, 'CIUpper': 2051.104372307134, 'ConfLevel': 90.0, 'ExogenousHolidays': 0, 'IsAnomaly': False, 'IsAnomalyExtreme': False, 'AnomalyProbability': 0.7545715087682185, 'DownAnomalyProbability': 0.12271424561589073, 'UpAnomalyProbability': 0.8772857543841093, 'ModelFreshness': 0.1}

.. Note :: Configuration optimization is an expensive process. It should be triggered when training runs for the first time on new data. The resulting optimal configuration can be stored and reused for future trainings with newer data points. The optimization should be rerun only when the structural pattern of the data changes or the model starts under-performing.
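
For example, the optimal configuration can be written out once and reloaded for later trainings. Below is a minimal sketch, assuming the ``HyperparameterOptimization.run`` workflow from this tutorial and a plain JSON file as the storage medium:

>>> import json
>>> from luminaire.optimization.hyperparameter_optimization import HyperparameterOptimization
>>> hopt_obj = HyperparameterOptimization(freq='D')
>>> opt_config = hopt_obj.run(data=data)        # expensive: run once per new dataset
>>> with open('opt_config.json', 'w') as f:     # persist the optimal configuration
...     json.dump(opt_config, f)
>>> with open('opt_config.json') as f:          # reload it for future trainings on newer data
...     stored_config = json.load(f)
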
2 changes: 1 addition & 1 deletion docs/tutorial/outlier_batch.rst
@@ -73,7 +73,7 @@ Filtering requires very minimal specification in terms of configurations. The us
>>> print(lad_filter_obj)
<luminaire_models.model.lad_filtering.LADFilteringModel object at 0x7fd2b1832dd8>

Similar to the structural model, the user can specify the *preprocessing_parameters* (see `Structural Modeling Tutorial <https://zillow.github.io/luminaire/basic_usage_tutorial/outlier_batch.html#anomaly-detection-using-structural-model>`_ for further information). These *preprocessing_parameters* are required to train the Luminaire filtering model.
Similar to the structural model, the user can specify the *preprocessing_parameters* (see `Structural Modeling Tutorial <https://zillow.github.io/luminaire/tutorial/outlier_batch.html#anomaly-detection-using-structural-model>`_ for further information). These *preprocessing_parameters* are required to train the Luminaire filtering model.
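
For reference, a minimal sketch of how these *preprocessing_parameters* are typically produced, assuming the ``DataExploration.profile`` workflow from the structural modeling tutorial (the constructor arguments are illustrative):

>>> from luminaire.exploration.data_exploration import DataExploration
>>> de_obj = DataExploration(freq='D', fill_rate=0.9, data_shift_truncate=True, is_log_transformed=False)
>>> data, pre_prc = de_obj.profile(data)   # pre_prc holds the preprocessing parameters passed to train below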

>>> success, model_date, model = lad_filter_obj.train(data=data, **pre_prc)
>>> print(success, model_date, model)
22 changes: 11 additions & 11 deletions docs/tutorial/streaming.rst
@@ -11,7 +11,7 @@ Although *WindowDensityModel* is designed to track anomalies over streaming dat
Anomaly Detection: Pre-Configured Settings
------------------------------------------

Luminaire provides the capability to configure model parameters based on the frequency at which the data has been observed and the methods that can be applied (please refer to the Window Density Model user guide for detailed configuration options). Luminaire settings for the window density model are already pre-configured for some typical pandas frequency types, and settings for any other frequency types should be configured manually (see the user guide for `Streaming Anomaly Detection Models <https://zillow.github.io/luminaire/user_guide/streaming.html>`_).
Luminaire provides the capability to configure model parameters based on the frequency at which the data has been observed and the methods that can be applied (please refer to the Window Density Model user guide for detailed configuration options). Luminaire settings for the window density model are already pre-configured for some typical pandas frequency types, and settings for any other frequency types should be configured manually (see the API reference for `Streaming Anomaly Detection Models <https://zillow.github.io/luminaire/api_reference/streaming.html>`_).

>>> from luminaire.model.window_density import WindowDensityHyperParams, WindowDensityModel
>>> print(data)
@@ -52,17 +52,17 @@ In order to score a new window innovation given the trained model object, we hav
>>> scoring_data
raw interpolated
index
2020-06-17 00:00:00 1121.0 1121.0
2020-06-17 00:01:00 1091.0 1091.0
2020-06-17 00:02:00 1063.0 1063.0
2020-06-17 00:03:00 1085.0 1085.0
2020-06-17 00:04:00 1063.0 1063.0
2020-06-17 00:00:00 11021.0 11021.0
2020-06-17 00:01:00 10931.0 10931.0
2020-06-17 00:02:00 10637.0 10637.0
2020-06-17 00:03:00 10845.0 10845.0
2020-06-17 00:04:00 10163.0 10163.0
... ... ...
2020-06-17 23:55:00 968.0 968.0
2020-06-17 23:56:00 995.0 995.0
2020-06-17 23:57:00 963.0 963.0
2020-06-17 23:58:00 968.0 968.0
2020-06-17 23:59:00 920.0 920.0
2020-06-17 23:55:00 9680.0 9680.0
2020-06-17 23:56:00 9985.0 9985.0
2020-06-17 23:57:00 9363.0 9363.0
2020-06-17 23:58:00 9686.0 9686.0
2020-06-17 23:59:00 9220.0 9220.0

>>> scores = model.score(scoring_data)
>>> print(scores)
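
The returned scores can then be filtered down to the anomalous windows. The sketch below assumes the output is a mapping from window timestamps to per-window result dicts; the field names ``IsAnomaly`` and ``AnomalyProbability`` are illustrative, so check the output of ``print(scores)`` for the exact fields returned by your version.

>>> # keep only the windows flagged as anomalous (key names assumed for illustration)
>>> anomalous = {ts: s for ts, s in scores.items() if s.get('IsAnomaly')}
>>> for ts, s in anomalous.items():
...     print(ts, s.get('AnomalyProbability'))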
24 changes: 13 additions & 11 deletions luminaire/exploration/data_exploration.py
@@ -17,22 +17,24 @@ class DataExploration(object):
'W-FRI', 'W-SAT'.
:param float sig_level: The significance level to use for any statistical test within the data profile. This should be
a number between 0 and 1.
:param min_ts_mean: The minimum mean value of the time series required for the model to run. For data that
:param float min_ts_mean: The minimum mean value of the time series required for the model to run. For data that
originated as integers (such as counts), the ARIMA model can behave erratically when the numbers are small. When
this parameter is set, any time series whose mean value is less than this will automatically result in a model
failure, rather than a mostly bogus anomaly.
:param fill_rate: Minimum proportion of data availability in the recent data window.
:param max_window_size: The maximum size of the sub windows for input data segmentation.
:param window_size: The size of the sub windows for input data segmentation.
:param min_ts_length: The minimum required length of the time series for training.
:param max_ts_length: The maximum required length of the time series for training.
:param is_log_transformed: A flag to specify whether to take a log transform of the input data. If the data
:param float fill_rate: Minimum proportion of data availability in the recent data window. Should be a fraction
between 0 and 1.
:param int max_window_size: The maximum size of the sub windows for input data segmentation.
:param int window_size: The size of the sub windows for input data segmentation.
:param int min_ts_length: The minimum required length of the time series for training.
:param int max_ts_length: The maximum required length of the time series for training.
:param bool is_log_transformed: A flag to specify whether to take a log transform of the input data. If the data
contain negatives, is_log_transformed is ignored even though it is set to True.
:param data_shift_truncate: A flag to specify whether left side of the most recent change point needs to
:param bool data_shift_truncate: A flag to specify whether left side of the most recent change point needs to
be truncated from the training data.
:param min_changepoint_padding_length: A padding length between two change points. This parameter makes sure
:param int min_changepoint_padding_length: A padding length between two change points. This parameter makes sure
that two consecutive change points are not close to each other.
:param change_point_threshold: Minimum threshold (a value > 0) to flag change points based on KL divergence.
:param float change_point_threshold: Minimum threshold (a value > 0) to flag change points based on KL divergence.
This parameter can be used to tune the sensitivity of the change point detection method.
.. _Pandas offset: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
"""
@@ -766,7 +768,7 @@ def profile(self, df, impute_only=False, **kwargs):
series model training.
:param list/pandas.DataFrame df: Input time series.
:param impute_only: Flag to perform preprocessing until imputation OR full preprocessing.
:param bool impute_only: Flag to perform preprocessing until imputation OR full preprocessing.
:return: Preprocessed dataframe with batch data summary.
:rtype: tuple[pandas.dataFrame, dict]
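
A short usage sketch for this method; the constructor arguments and the ``impute_only`` switch are illustrative:

>>> de_obj = DataExploration(freq='D', fill_rate=0.9, is_log_transformed=True)
>>> full_data, summary = de_obj.profile(df=data)                 # full preprocessing with batch data summary
>>> imputed_data, _ = de_obj.profile(df=data, impute_only=True)  # stop after imputation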
5 changes: 3 additions & 2 deletions luminaire/model/lad_filtering.py
@@ -43,8 +43,9 @@ class LADFilteringModel(BaseModel):
:param str freq: The frequency of the time-series. A `Pandas offset`_ such as 'D', 'H', or 'M'. Luminaire currently
supports the following pandas frequency types: 'H', 'D', 'W', 'W-SUN', 'W-MON', 'W-TUE', 'W-WED', 'W-THU',
'W-FRI', 'W-SAT'.
:param min_ts_length: The minimum required length of the time series for training.
:param max_ts_length: The maximum required length of the time series for training.
:param int min_ts_length: The minimum required length of the time series for training.
:param int max_ts_length: The maximum required length of the time series for training. The input time series will be
truncated if the length is greater than this value.
.. Note :: This class should be used to manually configure the structural model. Exact configuration parameters
can be found in `luminaire.model.lad_filtering.LADFilteringHyperParams`. Optimal configuration can be
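
A manual-configuration sketch along those lines, assuming a ``.params`` dict accessor on the hyperparameter class; the values shown are illustrative:

>>> from luminaire.model.lad_filtering import LADFilteringHyperParams, LADFilteringModel
>>> hyper = LADFilteringHyperParams(is_log_transformed=False)   # hyperparameter values are illustrative
>>> kf_model = LADFilteringModel(hyper_params=hyper.params, freq='D')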
3 changes: 2 additions & 1 deletion luminaire/model/lad_structural.py
@@ -56,7 +56,8 @@ class LADStructuralModel(BaseModel):
supports the following pandas frequency types: 'H', 'D', 'W', 'W-SUN', 'W-MON', 'W-TUE', 'W-WED', 'W-THU',
'W-FRI', 'W-SAT'.
:param int min_ts_length: The minimum required length of the time series for training.
:param int max_ts_length: The maximum required length of the time series for training.
:param int max_ts_length: The maximum required length of the time series for training. The input time series will be
truncated if the length is greater than this value.
:param float min_ts_mean: Minimum average values in the most recent window of the time series. This optional
parameter can be used to avoid over-alerting from noisy low volume time series.
:param int min_ts_mean_window: Size of the most recent window to calculate min_ts_mean.
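
For instance, a noisy low volume daily series can be guarded with the two parameters above. This is a sketch with illustrative thresholds, assuming a ``.params`` dict accessor on ``LADStructuralHyperParams``:

>>> from luminaire.model.lad_structural import LADStructuralHyperParams, LADStructuralModel
>>> hyper = LADStructuralHyperParams()
>>> model = LADStructuralModel(hyper_params=hyper.params, freq='D',
...                            min_ts_mean=100, min_ts_mean_window=10)   # illustrative thresholds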
60 changes: 30 additions & 30 deletions luminaire/model/window_density.py
@@ -20,8 +20,8 @@ class WindowDensityHyperParams(BaseModelHyperParams):
:param str detection_method: A string that selects between two window testing methods. Possible values:
- "kldiv" (KL-divergence)
- "sign_test" (Wilcoxon sign rank test)
- "kldiv" (KL-divergence). This is recommended to be set for high frequency time series such as 'S', 'T' etc.
- "sign_test" (Wilcoxon sign rank test). This is recommended to be set for low frequency time series such as 'H', 'D' etc.
:param int min_window_length: Minimum size of the scoring window / a stable training sub-window length.
@@ -39,8 +39,8 @@ class WindowDensityHyperParams(BaseModelHyperParams):
moving average method.
.. Note :: ma_window_length should be small enough to maintain the stable structure of the training / scoring window
and large enough to remove the trend. The ideal size can be somewhere between (0.1 * window_length) and
(0.25 * window length).
and large enough to remove the trend. The ideal size can be somewhere between (0.01 * window_length) and
(0.25 * window_length), depending on the data frequency.
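
For example, with minutely ('T') data and a one-day training or scoring window, that guidance translates to:

>>> window_length = 24 * 60                      # one day of minutely observations
>>> 0.01 * window_length, 0.25 * window_length   # suggested bounds for ma_window_length
(14.4, 360.0)
>>> ma_window_length = 60                        # e.g. a one-hour moving average sits inside that range
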
:param str detrend_method: A string that selects between two stationarizing methods. Possible values:
@@ -119,7 +119,7 @@ def __init__(self,
class WindowDensityModel(BaseModel):
"""
This model detects anomalous windows using KL divergence (for high frequency data) and Wilcoxon sign rank test
(for low frequency data).
(for low frequency data). The default monitoring frequency is set to the pandas time frequency type 'T'.
:param dict hyper_params: Hyper parameters for Luminaire window density model.
See :class:`luminaire.model.window_density.WindowDensityHyperParams` for detailed information.
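
A construction sketch using those hyperparameters, assuming the ``.params`` dict accessor on ``WindowDensityHyperParams`` and the two-element return of ``train`` documented below:

>>> from luminaire.model.window_density import WindowDensityHyperParams, WindowDensityModel
>>> config = WindowDensityHyperParams(freq='T').params   # pre-configured settings for minutely data
>>> wdm_obj = WindowDensityModel(hyper_params=config)
>>> success, model = wdm_obj.train(data=data)            # data: pandas.DataFrame indexed by timestamp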
@@ -414,7 +414,7 @@ def train(self, data, **kwargs):
"""
Input time series for training.
:param data: Input time series.
:param pandas.DataFrame data: Input time series.
:return: Training summary with a success flag.
:rtype: tuple(bool, python model object)
@@ -640,30 +640,30 @@ def score(self, data, **kwargs):
>>> data
raw interpolated
index
2018-10-06 00:00:00 204800 204800
2018-10-06 01:00:00 222218 222218
2018-10-06 02:00:00 218903 218903
2018-10-06 03:00:00 190639 190639
2018-10-06 04:00:00 148214 148214
2018-10-06 05:00:00 106358 106358
2018-10-06 06:00:00 70081 70081
2018-10-06 07:00:00 47748 47748
2018-10-06 08:00:00 36837 36837
2018-10-06 09:00:00 33023 33023
2018-10-06 10:00:00 44432 44432
2018-10-06 11:00:00 72773 72773
2018-10-06 12:00:00 115180 115180
2018-10-06 13:00:00 157568 157568
2018-10-06 14:00:00 180174 180174
2018-10-06 15:00:00 190048 190048
2018-10-06 16:00:00 188391 188391
2018-10-06 17:00:00 189233 189233
2018-10-06 18:00:00 191703 191703
2018-10-06 19:00:00 189848 189848
2018-10-06 20:00:00 192685 192685
2018-10-06 21:00:00 196743 196743
2018-10-06 22:00:00 193016 193016
2018-10-06 23:00:00 196441 196441
2018-10-11 00:00:00 204800 204800
2018-10-11 01:00:00 222218 222218
2018-10-11 02:00:00 218903 218903
2018-10-11 03:00:00 190639 190639
2018-10-11 04:00:00 148214 148214
2018-10-11 05:00:00 106358 106358
2018-10-11 06:00:00 70081 70081
2018-10-11 07:00:00 47748 47748
2018-10-11 08:00:00 36837 36837
2018-10-11 09:00:00 33023 33023
2018-10-11 10:00:00 44432 44432
2018-10-11 11:00:00 72773 72773
2018-10-11 12:00:00 115180 115180
2018-10-11 13:00:00 157568 157568
2018-10-11 14:00:00 180174 180174
2018-10-11 15:00:00 190048 190048
2018-10-11 16:00:00 188391 188391
2018-10-11 17:00:00 189233 189233
2018-10-11 18:00:00 191703 191703
2018-10-11 19:00:00 189848 189848
2018-10-11 20:00:00 192685 192685
2018-10-11 21:00:00 196743 196743
2018-10-11 22:00:00 193016 193016
2018-10-11 23:00:00 196441 196441
>>> model
<luminaire.model.window_density.WindowDensityModel object at 0x7fcaab72fdd8>
7 changes: 4 additions & 3 deletions luminaire/optimization/hyperparameter_optimization.py
@@ -14,10 +14,11 @@ class HyperparameterOptimization(object):
'W-FRI', 'W-SAT'.
:param str detection_type: Luminaire anomaly detection type. Only Outlier detection for batch data is currently
supported.
:param min_ts_mean: Minimum average values in the most recent window of the time series. This optional parameter
:param float min_ts_mean: Minimum average values in the most recent window of the time series. This optional parameter
can be used to avoid over-alerting from noisy low volume time series.
:param max_ts_length: The maximum required length of the time series for training.
:param min_ts_length: The minimum required length of the time series for training.
:param int max_ts_length: The maximum required length of the time series for training. The input time series will be
truncated if the length is greater than this value.
:param int min_ts_length: The minimum required length of the time series for training.
:param int scoring_length: Number of innovations to be scored after training window with respect to the frequency.
.. _Pandas offset: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
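
A usage sketch with the parameters above; the ``run`` call is assumed from the Luminaire optimization tutorial and the argument values are illustrative:

>>> from luminaire.optimization.hyperparameter_optimization import HyperparameterOptimization
>>> hopt_obj = HyperparameterOptimization(freq='D', min_ts_mean=50, scoring_length=30)
>>> opt_config = hopt_obj.run(data=data)   # optimal configuration for downstream training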