From ae3ce47460d15353a9305332d71dfd8b86daf747 Mon Sep 17 00:00:00 2001
From: hagne
Date: Mon, 3 Jun 2024 15:38:01 -0600
Subject: [PATCH] autocorrelation adaptation, qq test update

---
 atmPy/general/statistics.py | 353 +++++++++++++++++++++++++++++++-----
 1 file changed, 304 insertions(+), 49 deletions(-)

diff --git a/atmPy/general/statistics.py b/atmPy/general/statistics.py
index fe5f6ef..3b35489 100644
--- a/atmPy/general/statistics.py
+++ b/atmPy/general/statistics.py
@@ -5,7 +5,10 @@
 # from matplotlib.dates import DateFormatter as _DateFormatter
 from matplotlib.ticker import FuncFormatter as _FuncFormatter
 from matplotlib.ticker import MultipleLocator as _MultipleLocator
+import matplotlib.patches as mpatches
+from matplotlib.legend_handler import HandlerTuple
 import matplotlib.colors as _mcolors
+colors = _plt.rcParams['axes.prop_cycle'].by_key()['color']
 #import plt_tools as _plt_tools
 from atmPy.tools import array_tools as _array_tools
 import atmPy.tools.plt_tool_kit.colors as _atmcols
@@ -18,6 +21,7 @@
 import matplotlib.dates as _mpldates
 
 from statsmodels.graphics.tsaplots import plot_acf as statsmod_plot_acf
+import statsmodels.api as sm
 
 import scipy.stats as scistats
 
@@ -67,7 +71,7 @@ def __init__(self, parent_stats = None):
         self._trend_nsplines = None
         self._prediction_grid_size = None
         self._prediction_confidence = None
-        
+        self._autocorrelation = None
         self._rerun()
     
     def _rerun(self):
@@ -388,13 +392,111 @@ def prediction(self):
     @prediction.setter
     def prediction(self, value):
         self._prediction = value
+    
+    @property
+    def autocorrelation(self):
+        if isinstance(self._autocorrelation, type(None)):
+            self.autocorrelation_create_custom()
+        return self._autocorrelation
+    
+    def autocorrelation_create_custom(self, lagmax = None, z_score = 1.96):
+        """
+        Two confidence intervals are given:
+        Simple: based on the number of datapoints in each correlation. Note that the number of points decreases with increasing lag.
+        Bartlett's formula: Brockwell and Davis, 1987, Time Series: Theory and Methods. This takes the variability of the acf into account.
+        """
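+        # Formulas as implemented below (n datapoints, r_j = autocorrelation at lag j):
+        #   simple ci at lag k:    z_score * sqrt(1 / (n - k)),  since n - k pairs enter r_k
+        #   Bartlett ci at lag k:  z_score * sqrt((1 + 2 * sum_{j=1..k} r_j**2) / n)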
+        autocorr = [1]
+        residuals = self.prediction.residual
+        acfconf = [_np.sqrt(1/ residuals.shape[0]) * z_score]
+        lags = [0,]
+        ci_bartlett = [_np.sqrt(1/ residuals.shape[0]) * z_score]
+        # ci_lower_bartlett = []
+        lagmax_above_ci = None  # in case lagmax is given and the acf never drops below the Bartlett ci
+        
+        lag = 0
+        # lagmax = None
+        while 1:
+            lag += 1
+            lags.append(lag)
+            dt = _np.array([residuals[:-lag], residuals[lag:]])
+            ac = _np.corrcoef(dt)[0, 1]
+            autocorr.append(ac)
+            
+            ci = _np.sqrt(1/ (dt.shape[1])) * z_score
+            acfconf.append(ci)
+            
+            
+            # for k in range(len(acf_values)):
+            n = residuals.shape[0]
+            # n = dt.shape[1]
+            bartlett_se = _np.sqrt((1 + 2 * _np.sum(_np.array(autocorr)[1:]**2)) / n)
+            ci_b = z_score * bartlett_se
+            ci_bartlett.append(ci_b)
+            
+            # max lag test parameter
+            # if len(autocorr) >= 3:
+            maxlagparam = max(autocorr[-3:])
+            # else:
+            #     maxlagparam = ac
+            # print(acfconf[-3:])
+            # print(maxlagparam)
+            if not isinstance(lagmax, type(None)):
+                if lag >= lagmax:
+                    break
+                
+            elif ci_b > maxlagparam:
+                lagmax_above_ci = lag - 1
+                lagmax = int(lag * 1.5)
+        
+        acfdf = _pd.DataFrame({'acf': autocorr, 'ci': acfconf, 'ci_bartlett': ci_bartlett}, index = lags)
+        acfdf.index.name = 'lag'
+        acfdf.columns.name = 'acf_params'
+        self.prediction['autocorrelation'] = acfdf
+        self.prediction.autocorrelation.attrs['lagmax'] = lagmax
+        self.prediction.autocorrelation.attrs['lagmax_above_ci'] = lagmax_above_ci
+        self._autocorrelation = self.prediction.autocorrelation
+        return self.prediction.autocorrelation
+    
+    @property
+    def standard_error_adjustment_autocorr(self):
+        # Effective sample size n_eff = n / (1 + 2 * sum(r_k)); the factor sqrt(n / n_eff)
+        # inflates standard errors to account for autocorrelated residuals.
+        self.autocorrelation # make sure it is generated
+        autocorrs = self.prediction.autocorrelation.sel(acf_params = 'acf')[:self.prediction.autocorrelation.lagmax_above_ci]
+        rho_sum = autocorrs.sum()
+        n = self.gam_inst.statistics_['n_samples']
+        n_eff = n / (1 + 2 * rho_sum)
+        adjustment = float(_np.sqrt(n/n_eff))
+        self.prediction.autocorrelation.attrs['standard_error_adjustment_autocorr'] = adjustment
+        return adjustment
+    
+    @property
+    def standard_error_adjustment_newey_west(self):
+        """I think this only works for the linear term, because of the self.gam_inst.statistics_['se'][0] below."""
+        self.autocorrelation # make sure it is generated
+        data = _pd.DataFrame({'residuals': self.prediction.residual.values, 'x':self.data['x_dsincestart'].values})
+        data['const'] = 1
+        
+        model = sm.OLS(data['residuals'], data[['const', 'x']])
+        maxlags = self.prediction.autocorrelation.lagmax_above_ci
+        result = model.fit(cov_type='HAC', cov_kwds={'maxlags': maxlags})
+        standard_error_adjustment_newey_west = result.bse.x/self.gam_inst.statistics_['se'][0]
+        self.prediction.autocorrelation.attrs['standard_error_adjustment_newey_west'] = standard_error_adjustment_newey_west
+        return standard_error_adjustment_newey_west
+    
+    @property
+    def confidence_interval_adjusted_newey_west(self):
+        ci_adj = (self.prediction.partial_datetime_confidence - self.prediction.partial_datetime)
+        ci_adj *= self.standard_error_adjustment_newey_west
+        ci_adj += self.prediction.partial_datetime
+        self.prediction['partial_datetime_confidence_adjusted_nw'] = ci_adj
+        return ci_adj
+    
     
     def plot_seasonality(self, ax = None, 
                          offset = 0, 
                          xticklablesmonth = True, 
                          show_confidence = True, 
                          orientation='horizontal',
                          transform = None,
+                         obs_gridsize = 50,
                          # vcenter = True,
                          **plot_kwargs):
         """"
@@ -430,16 +532,22 @@ def plot_seasonality(self, ax = None, 
         if transform == 'log2lin':
             da = 10**da
-        # else:
-        #     da = self.prediction.partial_doy
-        da.plot(ax = a, y = y,**plot_kwargs)
-        
+        
+        ##################################
+        #### plot prediction
+        ##############
+        da.plot(ax = a, y = y, zorder = 10, 
+                **plot_kwargs)
+        g = a.get_lines()[-1]
+        col_major = g.get_color()
         # if transform == 'percent':
         #     da = (10**self.prediction.partial_doy - 1) * 100
         # else:
         #     da = self.prediction.partial_doy
         # (da + offset).plot(ax = a, y = y,**plot_kwargs)
-        
+        
+        
+        ###################################
+        #### confidence intervals
+        #############
         if show_confidence:
             bot,up = self.prediction.partial_doy_confidence.values.transpose()
             bot = bot + offset
@@ -448,11 +556,41 @@ def plot_seasonality(self, ax = None, 
                 bot = (10**bot - 1) * 100
                 up = (10**up - 1) * 100
             if orientation == 'vertical':
-                fill = a.fill_betweenx(self.prediction.doy,bot,up, color = [0,0,0,0.3])
+                fill = a.fill_betweenx(self.prediction.doy,bot,up, 
+                                       # color = [0,0,0,0.3],
+                                       color = col_major, alpha = 0.4,
+                                       zorder = 5 )
             else:
-                fill = a.fill_between(self.prediction.doy,bot,up, color = [0,0,0,0.3])
+                fill = a.fill_between(self.prediction.doy,bot,up, 
+                                      color = col_major,
+                                      alpha = 0.4,
+                                      zorder = 5 )
         else:
             fill = None
+        
+        ######################################
+        #### observations
+        ################
+        show_observations = True
+        hbin = None
+        if show_observations:
+            pdts = self.prediction.partial_datetime.to_pandas()
+            data = self.data.copy()
+            data['partial_dt'] = pdts
+            data = data.interpolate()
+            data['y_aod_tc'] = data.y_aod - data.partial_dt
+            cm = _plt.cm.Oranges_r
+            cm.set_under([0,0,0,0])
+            y = data.y_aod_tc + offset - self.prediction.intercept
+            hbin = a.hexbin(data.x_doy, y, linewidths=0.2, gridsize = obs_gridsize, vmin = 0.1, cmap = cm, zorder = 1)
+            hbin.set_clim(vmax = hbin.get_clim()[1] * 0.5)
+            pcolors = _plt.cm.Oranges_r(_np.linspace(0, 1, 100))
+            patch = [mpatches.Patch(facecolor=c) for c in pcolors]
+            a.legend(handles=[patch], labels=['data density',], fontsize='small',
+                     handler_map = {list: HandlerTuple(None)})
+        
+        #######################################
+        #### Settings
+        #############
         if xticklablesmonth:
             if orientation == 'vertical':
@@ -466,7 +604,7 @@ def plot_seasonality(self, ax = None, 
             axis.set_label_text('')
         # a.set_xlabel('')
         # a.set_xlabel('Day of year')
-        return f,a,fill
+        return f,a,fill, hbin
     
     def plot_prediction(self, ax = None, show_confidence=True, show_original_data=True, **plot_kwargs):
         if isinstance(ax, type(None)):
@@ -481,14 +619,20 @@ def plot_prediction(self, ax = None, show_confidence=True, show_original_data=Tr
         
         self.prediction.prediction.plot(ax = a, zorder = 3, label = 'prediction')
+        g = a.get_lines()[-1]
+        col_major = g.get_color()
         if show_confidence:
             a.fill_between(self.prediction.datetime_full, 
                            self.prediction.prediction_confidence.sel(boundary='top'), 
                            self.prediction.prediction_confidence.sel(boundary='bottom'), 
-                           alpha=0.5, zorder = 2, color = '0.5', label = 'confidence')
+                           alpha = 0.3, 
+                           zorder = 2, 
+                           # color = '0.5', 
+                           color = col_major,
+                           label = 'confidence')
         if show_original_data:
             data = self._parent_stats._parent_ts.data
-            a.plot(data.index, data.iloc[:,0], ls = '', marker = '.', markersize = 1, zorder = 1, label = 'observation', color = '0.7')
+            a.plot(data.index, data.iloc[:,0], ls = '', marker = '.', markersize = 1, zorder = 1, label = 'observation', color = '0.4')
         return f,a
     
     def plot_interactions(self, ax = None, xticklablesmonth = True, transform = None, cbkwargs = {}, offset = 'intercept'):
@@ -539,7 +683,8 @@ def plot_interactions(self, ax = None, xticklablesmonth = True, transform = None
         return f,a, pc, cbar
     
     def plot_overview(self, axis = None, show_confidence = True, show_original_data = True, 
-                      shade_seasons = True, transform = None, offset_season = 0):
+                      shade_seasons = True, transform = None, offset_season = 0,
+                      season_obs_gridsize = (80, 25)):
         """
         Parameters
         ===========
@@ -604,7 +749,11 @@ def plot_overview(self, axis = None, show_confidence = True, show_original_data 
         
         self.plot_trend(ax=a_trend, shade_seasons=shade_seasons, show_confidence=show_confidence, transform = transform)
         
-        self.plot_seasonality(ax=a_season, show_confidence = show_confidence, orientation=orientation, transform = transform, offset = offset_season)
+        self.plot_seasonality(ax=a_season, 
+                              show_confidence = show_confidence, 
+                              orientation=orientation, transform = transform, 
+                              offset = offset_season,
+                              obs_gridsize = season_obs_gridsize)
         
         if self.interactions:
             # a = aa[2,0]
@@ -618,11 +767,6 @@ def plot_overview(self, axis = None, show_confidence = True, show_original_data 
             out.append(pc)
             out.append(cb)
-        
-        # a = aa[0]
-        # fillb = a.get_children()[2]
-        # fillb.set_color('0.3')
-        # fillb.set_zorder(10)
         if self.interactions:
             for a in (aa[0,0], aa[1,0]):
                 a.set_xlim(aa[2,0].get_xlim())
@@ -656,7 +800,9 @@ def plot_overview(self, axis = None, show_confidence = True, show_original_data 
     
     def plot_trend(self, ax = None, 
                    shade_seasons = False,
+                   show_obs = True,
                    show_confidence = True,
+                   show_confidence_adjusted = True,
                    offset= 0,
                    transform = None,
                    **plot_kwargs):
@@ -698,7 +844,7 @@ def plot_trend(self, ax = None, 
             plot_kwargs['label'] = 'seasonal'
         
         if 'zorder' not in plot_kwargs:
-            plot_kwargs['zorder'] = 3
+            plot_kwargs['zorder'] = 5
         
         if isinstance(ax, type(None)):
             f, a = _plt.subplots()
@@ -711,20 +857,72 @@ def plot_trend(self, ax = None, 
             da = (10**self.prediction.partial_datetime -1) * 100
         else:
             da = self.prediction.partial_datetime
-        (da + offset).plot(ax = a, **plot_kwargs)
+        
+        ###################
+        #### Plot trend
+        (da + offset).plot(ax = a, **plot_kwargs)
+        g = a.get_lines()[-1]
+        col_major = g.get_color()
+        
+        ##################
+        #### Seasonally corrected data
+        if show_obs:
+            residual = self.prediction.residual
+            dat = (da + offset)
+            residual_plustrend = dat.interp(datetime = self.prediction.datetime_full) + residual
+            if 1:
+                g, = residual_plustrend.plot(ax = a, ls = '', marker = '.', markersize = 0.8, zorder = 1)
+                g.set_color('0.4')
+            else:
+                scale = 1
+                qm = a.hexbin(mdates.date2num(residual_plustrend.datetime), residual_plustrend, 
+                              # gridsize = (scale*50,scale*100), 
+                              
+                              linewidth = 0.2)
+                cm = _plt.cm.inferno_r
+                cm = _plt.cm.gnuplot2
+                cm.set_under([1,1,1,0])
+                qm.set_cmap(cm)
+                qm.set_clim(vmin = 0.1)
+        
+        ###################
+        ### confidence intervals
         if show_confidence:
             if transform == 'percent':
                 top = (10**self.prediction.partial_datetime_confidence.sel(boundary='top') -1) * 100
                 bottom = (10**self.prediction.partial_datetime_confidence.sel(boundary='bottom') -1) * 100
             else:
                 top = self.prediction.partial_datetime_confidence.sel(boundary='top') + offset
-                bottom = self.prediction.partial_datetime_confidence.sel(boundary='bottom') + offset
+                bottom = self.prediction.partial_datetime_confidence.sel(boundary='bottom') + offset
+            
+            a.fill_between(self.prediction.datetime, 
+                           top, 
+                           bottom, 
+                           alpha=0.3, 
+                           zorder = 3, 
+                           # color = '0.5', 
+                           color = col_major,
+                           label = 'ci')
+        
+        if show_confidence_adjusted:
+            # if transform == 'percent':
+            #     top = (10**self.prediction.partial_datetime_confidence.sel(boundary='top') -1) * 100
+            #     bottom = (10**self.prediction.partial_datetime_confidence.sel(boundary='bottom') -1) * 100
+            # else:
+            cia = self.confidence_interval_adjusted_newey_west
+            top = cia.sel(boundary='top') + offset
+            bottom = cia.sel(boundary='bottom') + offset
+            
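+            # The adjusted band keeps the fitted trend as its center and rescales the GAM
+            # confidence band by the Newey-West factor (HAC standard error of a linear fit
+            # to the residuals divided by the GAM standard error), i.e.
+            #   band_adjusted = trend + standard_error_adjustment_newey_west * (band - trend)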
             a.fill_between(self.prediction.datetime, 
-                           top, 
-                           bottom, 
-                           alpha=0.5, zorder = 2, color = '0.5', label = 'confidence')
+                           top, 
+                           bottom, 
+                           alpha=0.3, 
+                           zorder = 2, 
+                           # color = '0.7', 
+                           color = '#ff7f0e',
+                           label = 'ci adjusted')
         
         
         if shade_seasons:
@@ -789,7 +987,8 @@ def plot_trend(self, ax = None, 
     def plot_test_residual_vs_obs(self,
                                   ul=0.999, 
                                   ll=0.001,
-                                  ax=None):
+                                  ax=None, 
+                                  gridsize = 100):
         if not isinstance(ax, type(None)):
             a = ax
             f = a.get_figure()
@@ -814,6 +1013,7 @@ def plot_test_residual_vs_obs(self,
                       vmin = 0.000000001,
                       linewidths=0.2,
                       extent=extent,
+                      gridsize = gridsize,
                      )
         
         f.colorbar(pc)
@@ -840,10 +1040,10 @@ def plot_test_dist_of_res(self, bins = 50, ax = None):
         
         a.set_title('Distribution of residual')
         a.set_xlabel('Residual')
-        
+        a.axvline(0, color = [0,0,0,0.5], ls = '--')
         return f,a
     
-    def plot_test_qq(self, ax = None, textpos = (0.1, 0.9), sig_fig = 2):
+    def plot_test_qq(self, ax = None, textpos = (0.1, 0.9), sig_fig = 2, gridsize = 50):
         def format_significant(x, sig_digits=2):
             if x == 0:
                 return f"{0:.{sig_digits-1}f}"  # handle zero specially
@@ -878,15 +1078,46 @@ def format_significant(x, sig_digits=2):
         gs = a.get_lines()
         gs[0].remove()
         ((osm, osr),(slope, intercept, r)) = out
-        cmap = _plt.cm.gnuplot2
+        self.prediction.residual.attrs['qq_intercept'] = intercept
+        self.prediction.residual.attrs['qq_correlation'] = r
+        # cmap = _plt.cm.gnuplot2
+        cmap = _plt.cm.plasma
         cmap.set_under(color='none')
-        a.hexbin(osm, osr, cmap = cmap, vmin = 0.000000001, linewidths=0.2
+        pc = a.hexbin(osm, osr, cmap = cmap, vmin = 0.000000001, linewidths=0.2, gridsize = gridsize,
                  # zorder = 10
                  )
-        a.grid()
-        a.text(*textpos, f'm = {format_significant(slope, sig_fig)}\nc = {format_significant(intercept, sig_fig)}\nr={format_significant(r, sig_fig)}', transform = a.transAxes,
-               ha = 'left', va= 'top')
-        return f,a
+        # a.grid()
+        # if 0:
+        #     a.text(*textpos, 
+        #            (#f'm = {format_significant(slope, sig_fig)}\n'
+        #             f'intercept $c$ = {format_significant(intercept, sig_fig)}\n'
+        #             f'correlation $r$={format_significant(r, sig_fig)}'), 
+        #            transform = a.transAxes,
+        #            ha = 'left', va= 'top')
+        
+        g = a.get_lines()[-1]
+        # g.set_label(('linear reg., '
+        #              f'$c$ = {format_significant(intercept, sig_fig)}, '
+        #              f'$r$={format_significant(r, sig_fig)}'),
+        #             )
+        pcolors = pc.get_cmap()(_np.linspace(0, 1, 100))
+        patch = [mpatches.Patch(facecolor=c) for c in pcolors]
+        
+        handles=[patch,g]
+        glabel = ('linear reg.\n'
+                  f'$c$ = {format_significant(intercept, sig_fig)}\n'
+                  f'$r$ = {format_significant(r, sig_fig)}')
+        labels=[r'$\rho$(data)', glabel]
+        
+        leg = a.legend(handles, labels, 
+                       # ncol=len(handles), 
+                       fontsize='small',
+                       handler_map = {list: HandlerTuple(None)},
+                       loc = 4)
+        col = [0,0,0,.5]
+        a.axvline(0, ls = '--', color = col)
+        a.axhline(0, ls = '--', color = col)
+        return f,a,pc, leg
     
     def plot_test_residual(self, window = 2*365, ax = None, show_std = True, gridsize = 80):
         if not isinstance(ax, type(None)):
@@ -910,14 +1141,16 @@ def plot_test_residual(self, window = 2*365, ax = None, show_std = True, gridsiz
             aa = [a,]
         
         # residual = self.data.iloc[:,0] - self.prediction.prediction
-        residual = self.prediction.residual
-        residualdf = _pd.DataFrame(residual)
-        roll = residualdf.rolling(window, center = True, win_type='gaussian')
+        residual = self.prediction.residual.to_dataframe()
+        # residualdf = _pd.DataFrame(residual)
+        # residualdf = residual.to_dataframe()
+        
+        roll = residual.rolling(window, center = True, win_type='gaussian')
         
         a = aa[0]
         
-        out = a.hexbin(mdates.date2num(residual.datetime), residual, linewidths=0.2,
+        out = a.hexbin(mdates.date2num(residual.index), residual.residual, linewidths=0.2,
+                       gridsize = gridsize
                        )
         qm = out
@@ -952,27 +1185,49 @@ def plot_test_residual(self, window = 2*365, ax = None, show_std = True, gridsiz
         
         return f,a
     
-    def plot_test_autocorr(self, acf_kwargs = {'lags':60}, ax = None):
+    def plot_test_autocorr(self, 
+                           lag = None,
+                           # acf_kwargs = {'lags':60}, 
+                           ax = None):
         """
         Parameters
         -----------
-        acf_kwargs: see https://www.statsmodels.org/stable/generated/statsmodels.graphics.tsaplots.plot_acf.html
+        lag: int
+            Maximum lag. Note that this is limited to what has been created with autocorrelation_create_custom. Rerun
+            autocorrelation_create_custom with adjusted parameters if needed.
         """
         if not isinstance(ax, type(None)):
             a = ax
             f = a.get_figure()
         else:
             f,a = _plt.subplots()
-        # residual = self.data.iloc[:,0] - self.prediction.prediction
-        residual = self.prediction.residual
-        # _plt.figure(figsize=(10, 5))  # Set the figure size for better readability
-        statsmod_plot_acf(residual, ax = a, **acf_kwargs,
-                          # lags=160, 
-                          # use_vlines=True, 
-                          # alpha=0.05
-                          )  # Adjust 'lags' as necessary
-        a.set_title('Autocorrelation Function')
-        a.set_ylim(auto = True)
+        
+        acf = self.autocorrelation
+        if not isinstance(lag, type(None)):
+            acf = acf.sel(lag = slice(None, lag))  # apply the documented maximum lag
+        
+        sel = acf.sel(acf_params = 'acf')
+        sel.plot(ax = a, ls = '', marker = '.')
+        g = a.get_lines()[-1]
+        g.set_label('Autocorrelation function')
+        
+        a.vlines(acf.lag, 0, sel, alpha = 0.7)
+        
+        sel = acf.sel(acf_params = 'ci_bartlett')
+        out = a.fill_between(acf.lag, sel, -sel, alpha = 0.5)
+        out.set_label('Confidence interval')
+        
+        a.axhline(0, ls = '--', color = '0.5')
+        a.set_title('Autocorrelation function')
+        a.set_ylabel('Autocorrelation $r$')
+        a.set_xlabel('Lag')
+        a.legend()
+        
+        
+        # acf_kwargs: see https://www.statsmodels.org/stable/generated/statsmodels.graphics.tsaplots.plot_acf.html
+        # residual = self.prediction.residual
+        # statsmod_plot_acf(residual, ax = a, **acf_kwargs,
+        #                   )  # Adjust 'lags' as necessary
+        # a.set_title('Autocorrelation Function')
+        # a.set_ylim(auto = True)
         return f,a
     
     def plot_test_obs_vs_pred(self, ll = 0.05, ul = 0.95, ax = None):