From 6471a96ac5a0c6cae0646e9812d4206d75ab1544 Mon Sep 17 00:00:00 2001 From: Hannah Lu <30671575+hlu109@users.noreply.github.com> Date: Fri, 12 Aug 2022 19:33:07 -0400 Subject: [PATCH 01/28] add copy of code used in TRB paper --- TRB_label_assist/models.py | 2035 ++++++++++++++++++++++++++++++++++++ 1 file changed, 2035 insertions(+) create mode 100644 TRB_label_assist/models.py diff --git a/TRB_label_assist/models.py b/TRB_label_assist/models.py new file mode 100644 index 000000000..e5283d730 --- /dev/null +++ b/TRB_label_assist/models.py @@ -0,0 +1,2035 @@ +import pandas as pd +import numpy as np +from abc import ABCMeta, abstractmethod # to define abstract class "blueprints" +import logging +import copy + +# sklearn imports +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.impute import SimpleImputer +from sklearn.metrics.pairwise import haversine_distances +from sklearn.cluster import DBSCAN +from sklearn import svm +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier +from sklearn.tree import DecisionTreeClassifier +from sklearn.exceptions import NotFittedError + +# our imports +from clustering import get_distance_matrix, single_cluster_purity +import data_wrangling +import emission.storage.decorations.trip_queries as esdtq +import emission.analysis.modelling.tour_model_first_only.build_save_model as bsm +import emission.analysis.modelling.tour_model_first_only.evaluation_pipeline as ep +from emission.analysis.classification.inference.labels.inferrers import predict_cluster_confidence_discounting +import emission.core.wrapper.entry as ecwe +import emission.analysis.modelling.tour_model_extended.similarity as eamts + +# logging.basicConfig(level=logging.DEBUG) + +EARTH_RADIUS = 6371000 + +############################# +## define abstract classes ## +############################# + + +class SetupMixin(metaclass=ABCMeta): + """ class containing code to be reused when setting up estimators. """ + + @abstractmethod + def set_params(self, params): + """ Set the parameters of the estimator. + + Args: + params (dict): dictionary where the keys are the param names + (strings) and the values are the parameter inputs + + Returns: + self + """ + raise NotImplementedError + + def _clean_data(self, df): + """ Clean a dataframe of trips. + (Drop trips with missing start/end locations, expand the user input + columns, ensure all essential columns are present) + + Args: + df: a dataframe of trips. 
must contain the columns 'start_loc', + 'end_loc', and should also contain the user input columns + ('mode_confirm', 'purpose_confirm', 'replaced_mode') if + available + """ + assert 'start_loc' in df.columns and 'end_loc' in df.columns + + # clean up the dataframe by dropping entries with NaN locations and + # reset index + num_nan = 0 + if df.start_loc.isna().any(): + num_nan += df.start_loc.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['start_loc']) + if df.end_loc.isna().any(): + num_nan += df.end_loc.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['end_loc']) + + # expand the 'start_loc' and 'end_loc' column into 'start_lat', + # 'start_lon', 'end_lat', and 'end_lon' columns + df = data_wrangling.expand_coords(df) + + # drop trips with missing coordinates + if df.start_lat.isna().any(): + num_nan += df.start_lat.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['start_lat']) + if df.start_lon.isna().any(): + num_nan += df.start_lon.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['start_lon']) + if df.end_lat.isna().any(): + num_nan += df.end_lat.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['end_lat']) + if df.end_lon.isna().any(): + num_nan += df.end_lon.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['end_lon']) + if num_nan > 0: + logging.info( + f'dropped {num_nan} trips that are missing location coordinates' + ) + + df = df.rename( + columns={ + 'mode_confirm': 'mode_true', + 'purpose_confirm': 'purpose_true', + 'replaced_mode': 'replaced_true' + }) + + for category in ['mode_true', 'purpose_true', 'replaced_true']: + if category not in df.columns: + # for example, if a user labels all their trip modes but none of their trip purposes + df.loc[:, category] = np.nan + + return df.reset_index(drop=True) + + +class Cluster(SetupMixin, metaclass=ABCMeta): + """ blueprint for clustering models. """ + + @abstractmethod + def fit(self, train_df): + """ Fit the clustering algorithm. + + Args: + train_df (DataFrame): dataframe of labeled trips + + Returns: + self + """ + raise NotImplementedError + + @abstractmethod + def predict(self, test_df): + """ Predict cluster indices for trips, if possible. Trips that could + not be clustered will have the index -1. + + Args: + test_df (DataFrame): dataframe of test trips + + Returns: + pd DataFrame containing one column, 'start_cluster_idx' or + 'end_cluster_idx' + """ + raise NotImplementedError + + def fit_predict(self, train_df): + """ Fit the clustering algorithm and predict cluster indices for trips, + if possible. Trips that could not be clustered will have the index -1. + + Args: + train_df (DataFrame): dataframe of labeled trips + + Returns: + pd DataFrame containing one column, 'start_cluster_idx' or + 'end_cluster_idx' + """ + self.fit(train_df) + return self.predict(train_df) + + +class TripClassifier(SetupMixin, metaclass=ABCMeta): + + @abstractmethod + def fit(self, train_df): + """ Fit a classification model. + + Args: + train_df (DataFrame): dataframe of labeled trips + + Returns: + self + """ + raise NotImplementedError + + def predict(self, test_df): + """ Predict trip labels. + + Args: + test_df (DataFrame): dataframe of trips + + Returns: + DataFrame containing the following columns: + 'purpose_pred', 'mode_pred', 'replaced_pred', + 'purpose_proba', 'mode_proba', 'replaced_proba' + the *_pred columns contain the most-likely label prediction + (string for a label or float for np.nan).
+ the *_proba columns contain the probability of the most-likely + prediction. + """ + proba_df = self.predict_proba(test_df) + prediction_df = proba_df.loc[:, [('purpose', 'top_pred'), + ('purpose', 'top_proba'), + ('mode', 'top_pred'), + ('mode', 'top_proba'), + ('replaced', 'top_pred'), + ('replaced', 'top_proba')]] + + prediction_df.columns = prediction_df.columns.to_flat_index() + prediction_df = prediction_df.rename( + columns={ + ('purpose', 'top_pred'): 'purpose_pred', + ('purpose', 'top_proba'): 'purpose_proba', + ('mode', 'top_pred'): 'mode_pred', + ('mode', 'top_proba'): 'mode_proba', + ('replaced', 'top_pred'): 'replaced_pred', + ('replaced', 'top_proba'): 'replaced_proba', + }) + + return prediction_df + + def fit_predict(self, train_df): + """ Fit a classification model and predict trip labels. + + Args: + train_df (DataFrame): dataframe of labeled trips + + Returns: + DataFrame containing the following columns: + 'purpose_pred', 'mode_pred', 'replaced_pred', + 'purpose_proba', 'mode_proba', 'replaced_proba' + the *_pred columns contain the most-likely label prediction + (string for a label or float for np.nan). + the *_proba columns contain the probability of the most-likely + prediction. + """ + self.fit(train_df) + return self.predict(train_df) + + @abstractmethod + def predict_proba(self, test_df): + """ Predict class probabilities for each trip. + + NOTE: check the specific model to see if the class probabilities + have confidence-discounting or not. + + Args: + test_df (DataFrame): dataframe of trips + + Returns: + DataFrame with multiindexing. Each row represents a trip. There + are 3 columns at level 1, one for each label category + ('purpose', 'mode', 'replaced'). Within each category, there is + a column for each label, with the row's entry being the + probability that the trip has the label. There are three + additional columns within each category, one indicating the + most-likely label, one indicating the probability of the + most-likely label, and one indicating whether or not the trip + can be clustered. + TODO: add a fourth optional column for the number of trips in + the cluster (if clusterable) + + Level 1 columns are: purpose, mode, replaced + Lebel 2 columns are: + , , ... top_pred, top_proba, clusterable + , , ... top_pred, top_proba, clusterable + , , ... top_pred, top_proba, clusterable + """ + raise NotImplementedError + + +######################## +## clustering classes ## +######################## + + +class RefactoredNaiveCluster(Cluster): + """ Naive fixed-width clustering algorithm. + Refactored from the existing Similarity class to take in dataframes for + consistency, and allows for separate clustering of start and end + clusters. + + WARNING: this algorithm is *extremely* slow. + + Args: + loc_type (str): 'start' or 'end', the type of point to cluster + radius (int): max distance between all pairs of points in a + cluster, i.e. strict maximum cluster width. + + Attributes: + loc_type (str) + radius (int) + train_df (DataFrame) + test_df (DataFrame) + sim_model (Similarity object) + """ + + def __init__(self, loc_type='end', radius=100): + self.loc_type = loc_type + self.radius = radius + + def set_params(self, params): + if 'loc_type' in params.keys(): self.loc_type = params['loc_type'] + if 'radius' in params.keys(): self.radius = params['radius'] + + return self + + def fit(self, train_df): + # clean data + self.train_df = self._clean_data(train_df) + + # we can use all trips as long as they have purpose labels. 
it's ok if + # they're missing mode/replaced-mode labels, because they aren't as + # strongly correlated with location compared to purpose + # TODO: actually, we may want to rethink this. for example, it will + # probably be helpful to include trips that are missing purpose labels + # but still have mode labels. + if self.train_df.purpose_true.isna().any(): + num_nan = self.train_df.purpose_true.value_counts( + dropna=False).loc[np.nan] + logging.info( + f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' + ) + self.train_df = self.train_df.dropna( + subset=['purpose_true']).reset_index(drop=True) + if len(self.train_df) == 0: + # i.e. no valid trips after removing all nans + raise Exception('no valid trips; nothing to fit') + + # fit the bins + self.sim_model = eamts.Similarity(self.train_df, + radius_start=self.radius, + radius_end=self.radius, + shouldFilter=False, + cutoff=False) + # we only bin the loc_type points to speed up the alg. avoid + # unnecessary binning since this is really slow + self.sim_model.bin_helper(loc_type=self.loc_type) + labels = self.sim_model.data_df[self.loc_type + '_bin'].to_list() + self.train_df.loc[:, f'{self.loc_type}_cluster_idx'] = labels + return self + + def predict(self, test_df): + self.test_df = self._clean_data(test_df) + + if self.loc_type == 'start': + bins = self.sim_model.start_bins + elif self.loc_type == 'end': + bins = self.sim_model.end_bins + + labels = [] + + # for each trip in the test list: + for idx, row in self.test_df.iterrows(): + # iterate over all bins + trip_binned = False + for i, bin in enumerate(bins): + # check if the trip can fit in the bin + # if so, get the bin index + if self._match(row, bin, self.loc_type): + labels += [i] + trip_binned = True + break + # if not, return -1 + if not trip_binned: + labels += [-1] + + self.test_df.loc[:, f'{self.loc_type}_cluster_idx'] = labels + + return self.test_df[[f'{self.loc_type}_cluster_idx']] + + def _match(self, trip, bin, loc_type): + """ Check if a trip can fit into an existing bin. + + copied from the Similarity class on the e-mission-server. + """ + for t_idx in bin: + trip_in_bin = self.train_df.iloc[t_idx] + if not self._distance_helper(trip, trip_in_bin, loc_type): + return False + return True + + def _distance_helper(self, tripa, tripb, loc_type): + """ Check if two trips have start/end points within the distance + threshold. + + copied from the Similarity class on the e-mission-server. + """ + pta_lat = tripa[[loc_type + '_lat']] + pta_lon = tripa[[loc_type + '_lon']] + ptb_lat = tripb[[loc_type + '_lat']] + ptb_lon = tripb[[loc_type + '_lon']] + + return eamts.within_radius(pta_lat, pta_lon, ptb_lat, ptb_lon, + self.radius) + + +class DBSCANSVMCluster(Cluster): + """ DBSCAN-based clustering algorithm that optionally implements SVM + sub-clustering. + + Args: + loc_type (str): 'start' or 'end', the type of point to cluster + radius (int): max distance between two points in each other's + neighborhood, i.e. DBSCAN's eps value. 
does not strictly + dictate final cluster size + size_thresh (int): the min number of trips a cluster must have + to be considered for SVM sub-division + purity_thresh (float): the min purity a cluster must have + to be sub-divided using SVM + gamma (float): coefficient for the rbf kernel in SVM + C (float): regularization hyperparameter for SVM + + Attributes: + loc_type (str) + radius (int) + size_thresh (int) + purity_thresh (float) + gamma (float) + C (float) + train_df (DataFrame) + test_df (DataFrame) + base_model (sklearn Estimator) + """ + + def __init__(self, + loc_type='end', + radius=100, + svm=True, + size_thresh=1, + purity_thresh=1.0, + gamma=0.05, + C=1): + self.loc_type = loc_type + self.radius = radius + self.svm = svm + self.size_thresh = size_thresh + self.purity_thresh = purity_thresh + self.gamma = gamma + self.C = C + + def set_params(self, params): + if 'loc_type' in params.keys(): self.loc_type = params['loc_type'] + if 'radius' in params.keys(): self.radius = params['radius'] + if 'svm' in params.keys(): self.svm = params['svm'] + if 'size_thresh' in params.keys(): + self.size_thresh = params['size_thresh'] + if 'purity_thresh' in params.keys(): + self.purity_thresh = params['purity_thresh'] + if 'gamma' in params.keys(): self.gamma = params['gamma'] + + return self + + def fit(self, train_df): + """ Creates clusters of trip points. + self.train_df will be updated with columns containing base and + final clusters. + + TODO: perhaps move the loc_type argument to fit() so we can use a + single class instance to cluster both start and end points. This + will also help us reduce duplicate data. + + Args: + train_df (dataframe): dataframe of labeled trips + """ + ################## + ### clean data ### + ################## + self.train_df = self._clean_data(train_df) + + # we can use all trips as long as they have purpose labels. it's ok if + # they're missing mode/replaced-mode labels, because they aren't as + # strongly correlated with location compared to purpose + # TODO: actually, we may want to rethink this. for example, it will + # probably be helpful to include trips that are missing purpose labels + # but still have mode labels. + if self.train_df.purpose_true.isna().any(): + num_nan = self.train_df.purpose_true.value_counts( + dropna=False).loc[np.nan] + logging.info( + f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' + ) + self.train_df = self.train_df.dropna( + subset=['purpose_true']).reset_index(drop=True) + if len(self.train_df) == 0: + # i.e. no valid trips after removing all nans + raise Exception('no valid trips; nothing to fit') + + ######################### + ### get base clusters ### + ######################### + dist_matrix_meters = get_distance_matrix(self.train_df, self.loc_type) + self.base_model = DBSCAN(self.radius, + metric="precomputed", + min_samples=1).fit(dist_matrix_meters) + base_clusters = self.base_model.labels_ + + self.train_df.loc[:, + f'{self.loc_type}_base_cluster_idx'] = base_clusters + + ######################## + ### get sub-clusters ### + ######################## + # copy base cluster column into final cluster column + self.train_df.loc[:, f'{self.loc_type}_cluster_idx'] = self.train_df[ + f'{self.loc_type}_base_cluster_idx'] + + if self.svm: + c = 0 # count of how many clusters we have iterated over + + # iterate over all clusters and subdivide them with SVM. 
the while + # loop is so we can do multiple iterations of subdividing if needed + while c < self.train_df[f'{self.loc_type}_cluster_idx'].max(): + points_in_cluster = self.train_df[ + self.train_df[f'{self.loc_type}_cluster_idx'] == c] + + # only do SVM if we have the minimum num of trips in the cluster + if len(points_in_cluster) < self.size_thresh: + c += 1 + continue + + # only do SVM if purity is below threshold + purity = single_cluster_purity(points_in_cluster, + label_col='purpose_true') + if purity < self.purity_thresh: + X = points_in_cluster[[ + f"{self.loc_type}_lon", f"{self.loc_type}_lat" + ]] + y = points_in_cluster.purpose_true.to_list() + + svm_model = make_pipeline( + StandardScaler(), + svm.SVC( + kernel='rbf', + gamma=self.gamma, + C=self.C, + )).fit(X, y) + labels = svm_model.predict(X) + unique_labels = np.unique(labels) + + # if the SVM predicts that all points in the cluster have + # the same label, just ignore it and don't reindex. + # this also helps us to handle the possibility that a + # cluster may be impure but inherently inseparable, e.g. an + # end cluster at a user's home, containing 50% trips from + # work to home and 50% round trips that start and end at + # home. we don't want to reindex otherwise the low purity + # will trigger SVM again, and we will attempt & fail to + # split the cluster ad infinitum + if len(unique_labels) > 1: + # map purpose labels to new cluster indices + # we offset indices by the max existing index so that we + # don't run into any duplicate indices + max_existing_idx = self.train_df[ + f'{self.loc_type}_cluster_idx'].max() + label_to_cluster = { + unique_labels[i]: i + max_existing_idx + 1 + for i in range(len(unique_labels)) + } + # update trips with their new cluster indices + indices = np.array( + [label_to_cluster[l] for l in labels]) + self.train_df.loc[ + self.train_df[f'{self.loc_type}_cluster_idx'] == c, + f'{self.loc_type}_cluster_idx'] = indices + + c += 1 + # TODO: make things categorical at the end? or maybe at the start of the decision tree pipeline + + return self + + def fit_predict(self, train_df): + """ Override to avoid unnecessarily computation of distance matrices. + """ + self.fit(train_df) + return self.train_df[[f'{self.loc_type}_cluster_idx']] + + def predict(self, test_df): + # TODO: store clusters as polygons so the prediction is faster + # TODO: we probably don't want to store test_df in self to be more memory-efficient + self.test_df = self._clean_data(test_df) + pred_clusters = self._NN_predict(self.test_df) + + self.test_df.loc[:, f'{self.loc_type}_cluster_idx'] = pred_clusters + + return self.test_df[[f'{self.loc_type}_cluster_idx']] + + def _NN_predict(self, test_df): + """ Generate base-cluster predictions for the test data using a + nearest-neighbor approach. + + sklearn doesn't implement predict() for DBSCAN, which is why we + need a custom method. 
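For reference, the same nearest-neighbour-within-epsilon idea as a standalone sketch (hypothetical helper name nn_cluster_assign and toy coordinates; the actual implementation below operates on self.train_df):

```python
import numpy as np
from sklearn.metrics.pairwise import haversine_distances

EARTH_RADIUS = 6371000  # meters, same constant defined at the top of this module


def nn_cluster_assign(train_latlon, train_cluster_idx, test_latlon, radius=100):
    """Assign each test point the cluster of its nearest training point,
    or -1 if no training point lies within `radius` meters."""
    train_rad = np.radians(np.asarray(train_latlon))  # (n_train, 2), [lat, lon]
    test_rad = np.radians(np.asarray(test_latlon))    # (n_test, 2), [lat, lon]
    dists = haversine_distances(test_rad, train_rad) * EARTH_RADIUS
    nearest = dists.argmin(axis=1)
    labels = np.asarray(train_cluster_idx)[nearest]
    labels[dists[np.arange(len(test_rad)), nearest] >= radius] = -1
    return labels


# two training points in clusters 0 and 1; the test point is ~15 m from the first
nn_cluster_assign([[39.7400, -104.9900], [39.7500, -105.0000]], [0, 1],
                  [[39.7401, -104.9901]])  # -> array([0])
```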
+ """ + n_samples = test_df.shape[0] + labels = np.ones(shape=n_samples, dtype=int) * -1 + + # get coordinates of core points (we can't use model.components_ + # because our input feature was a distance matrix and doesn't contain + # info about the raw coordinates) + # NOTE: technically, every single point in a cluster is a core point + # because it has at least minPts (2) points, including itself, in its + # radius + train_coordinates = self.train_df[[ + f'{self.loc_type}_lat', f'{self.loc_type}_lon' + ]] + train_radians = np.radians(train_coordinates) + + for idx, row in test_df.reset_index(drop=True).iterrows(): + # calculate the distances between the ith test data and all points, + # then find the index of the closest point. if the ith test data is + # within epsilon of the point, then assign its cluster to the ith + # test data (otherwise, leave it as -1, indicating noise). + # unfortunately, pairwise_distances_argmin() does not support + # haversine distance, so we have to reimplement it ourselves + new_loc_radians = np.radians( + row[[self.loc_type + "_lat", + self.loc_type + "_lon"]].to_list()) + new_loc_radians = np.reshape(new_loc_radians, (1, 2)) + dist_matrix_meters = haversine_distances( + new_loc_radians, train_radians) * EARTH_RADIUS + + shortest_dist_idx = np.argmin(dist_matrix_meters) + if dist_matrix_meters[0, shortest_dist_idx] < self.radius: + labels[idx] = self.train_df.reset_index( + drop=True).loc[shortest_dist_idx, + f'{self.loc_type}_cluster_idx'] + + return labels + + +###################### +## trip classifiers ## +###################### + + +class NaiveBinningClassifier(TripClassifier): + """ Trip classifier using the existing Similarity class and associated + functions without refactoring them. Essentially a wrapper for the + existing code on e-mission-server. + + Args: + radius (int): maximum distance between any two points in the same + cluster + """ + + def __init__(self, radius=500): + self.radius = radius + + def set_params(self, params): + if 'radius' in params.keys(): self.radius = params['radius'] + + return self + + def fit(self, train_df): + # (copied from bsm.build_user_model()) + + # convert train_df to a list because the existing binning algorithm + # only accepts lists of Entry objects + train_trips = self._trip_df_to_list(train_df) + + sim, bins, bin_trips, train_trips = ep.first_round( + train_trips, self.radius) + + # set instance variables so we can access results later as well + self.sim = sim + self.bins = bins + + # save all user labels + user_id = train_df.user_id.iloc[0] + bsm.save_models('user_labels', + bsm.create_user_input_map(train_trips, bins), user_id) + + # save location features of all bins + bsm.save_models('locations', + bsm.create_location_map(train_trips, bins), user_id) + return self + + def predict_proba(self, test_df): + """ NOTE: these class probabilities have the confidence-discounting + heuristic applied. 
+ """ + # convert test_df to a list because the existing binning algorithm + # only accepts lists of Entry objects + test_trips = self._trip_df_to_list(test_df) + + purpose_distribs = [] + mode_distribs = [] + replaced_distribs = [] + + for trip in test_trips: + trip_prediction = predict_cluster_confidence_discounting(trip) + + if len(trip_prediction) == 0: + # model could not find cluster for the trip + purpose_distribs += [{}] + mode_distribs += [{}] + replaced_distribs += [{}] + + else: + trip_prediction_df = pd.DataFrame(trip_prediction).rename( + columns={'labels': 'user_input'}) + # renaming is simply so we can use the expand_userinputs + # function + + expand_prediction = esdtq.expand_userinputs(trip_prediction_df) + # converts the 'labels' dictionaries into individual columns + + # sum up probability for each label + for label_type, label_distribs in zip( + ['purpose_confirm', 'mode_confirm', 'replaced_mode'], + [purpose_distribs, mode_distribs, replaced_distribs]): + label_distrib = {} + if label_type in expand_prediction.columns: + for label in expand_prediction[label_type].unique(): + label_distrib[label] = expand_prediction.loc[ + expand_prediction[label_type] == label, + 'p'].sum() + label_distribs += [label_distrib] + + proba_dfs = [] + for label_type, label_distribs in zip( + ['purpose', 'mode', 'replaced'], + [purpose_distribs, mode_distribs, replaced_distribs]): + + proba = pd.DataFrame(label_distribs) + proba['clusterable'] = proba.sum(axis=1) > 0 + proba['top_pred'] = proba.drop(columns=['clusterable']).idxmax( + axis=1) + proba['top_proba'] = proba.drop( + columns=['clusterable', 'top_pred']).max(axis=1, skipna=True) + classes = proba.columns[:-3] + proba.loc[:, classes] = proba.loc[:, classes].fillna(0) + proba = pd.concat([proba], keys=[label_type], axis=1) + proba_dfs += [proba] + + self.proba_df = pd.concat(proba_dfs, axis=1) + return self.proba_df + + def _trip_df_to_list(self, trip_df): + """ Converts a dataframe of trips into a list of trip Entry objects. + + Allows this class to accept DataFrames (which are used by the new + clustering algorithms) without having to refactor the old + clustering algorithm. + + Args: + trip_df: DataFrame containing trips. See code below for the + expected columns. + + """ + trips_list = [] + + for idx, row in trip_df.iterrows(): + data = { + 'source': row['source'], + 'end_ts': row['end_ts'], + # 'end_local_dt':row['end_local_dt'], # this attribute doesn't seem to appear in the dataframes I've tested with + 'end_fmt_time': row['end_fmt_time'], + 'end_loc': row['end_loc'], + 'raw_trip': row['raw_trip'], + 'start_ts': row['start_ts'], + # 'start_local_dt':row['start_local_dt'], # this attribute doesn't seem to appear in the dataframes I've tested with + 'start_fmt_time': row['start_fmt_time'], + 'start_loc': row['start_loc'], + 'duration': row['duration'], + 'distance': row['distance'], + 'start_place': row['start_place'], + 'end_place': row['end_place'], + 'cleaned_trip': row['cleaned_trip'], + 'inferred_labels': row['inferred_labels'], + 'inferred_trip': row['inferred_trip'], + 'expectation': row['expectation'], + 'confidence_threshold': row['confidence_threshold'], + 'expected_trip': row['expected_trip'], + 'user_input': row['user_input'] + } + trip = ecwe.Entry.create_entry(user_id=row['user_id'], + key='analysis/confirmed_trip', + data=data) + trips_list += [trip] + + return trips_list + + +class ClusterExtrapolationClassifier(TripClassifier): + """ Classifier that extrapolates labels from a trip's cluster. 
+ + Args: + alg (str): clustering algorithm to use; either 'DBSCAN' or 'naive' + radius (int): radius for the clustering algorithm + svm (bool): whether or not to use SVM sub-clustering. (only when + alg=='DBSCAN') + size_thresh (int): the min number of trips a cluster must have + to be considered for SVM sub-division + purity_thresh (float): the min purity a cluster must have + to be sub-divided using SVM + gamma (float): coefficient for the rbf kernel in SVM + C (float): regularization hyperparameter for SVM + cluster_method (str): 'end', 'trip', 'combination'. whether to + extrapolate labels from only end clusters, only trip clusters, + or both end and trip clusters when available. + """ + + def __init__( + self, + alg='DBSCAN', + radius=100, # TODO: add diff start and end radii + svm=True, + size_thresh=1, + purity_thresh=1.0, + gamma=0.05, + C=1, + cluster_method='end'): + assert cluster_method in ['end', 'trip', 'combination'] + assert alg in ['DBSCAN', 'naive'] + self.alg = alg + self.radius = radius + self.svm = svm + self.size_thresh = size_thresh + self.purity_thresh = purity_thresh + self.gamma = gamma + self.C = C + self.cluster_method = cluster_method + + if self.alg == 'DBSCAN': + self.end_cluster_model = DBSCANSVMCluster( + loc_type='end', + radius=self.radius, + svm=self.svm, + size_thresh=self.size_thresh, + purity_thresh=self.purity_thresh, + gamma=self.gamma, + C=self.C) + elif self.alg == 'naive': + self.end_cluster_model = RefactoredNaiveCluster(loc_type='end', + radius=self.radius) + + if self.cluster_method in ['trip', 'combination']: + if self.alg == 'DBSCAN': + self.start_cluster_model = DBSCANSVMCluster( + loc_type='start', + radius=self.radius, + svm=self.svm, + size_thresh=self.size_thresh, + purity_thresh=self.purity_thresh, + gamma=self.gamma, + C=self.C) + elif self.alg == 'naive': + self.start_cluster_model = RefactoredNaiveCluster( + loc_type='start', radius=self.radius) + + self.trip_grouper = TripGrouper( + start_cluster_col='start_cluster_idx', + end_cluster_col='end_cluster_idx') + + def set_params(self, params): + """ hacky code that mimics the set_params of an sklearn Estimator class + so that we can pass params during randomizedsearchCV + + Args: + params (dict): a dictionary where the keys are the parameter + names and the values are the parameter values + """ + alg = params['alg'] if 'alg' in params.keys() else self.alg + radius = params['radius'] if 'radius' in params.keys() else self.radius + svm = params['svm'] if 'svm' in params.keys() else self.svm + size_thresh = params['size_thresh'] if 'size_thresh' in params.keys( + ) else self.size_thresh + purity_thresh = params[ + 'purity_thresh'] if 'purity_thresh' in params.keys( + ) else self.purity_thresh + gamma = params['gamma'] if 'gamma' in params.keys() else self.gamma + C = params['C'] if 'C' in params.keys() else self.C + cluster_method = params[ + 'cluster_method'] if 'cluster_method' in params.keys( + ) else self.cluster_method + + # calling __init__ again is not good practice, I know... 
+ self.__init__(alg, radius, svm, size_thresh, purity_thresh, gamma, C, + cluster_method) + + return self + + def fit(self, train_df): + # fit clustering model + self.end_cluster_model.fit(train_df) + self.train_df = self.end_cluster_model.train_df + + if self.cluster_method in ['trip', 'combination']: + self.start_cluster_model.fit(train_df) + self.train_df.loc[:, ['start_cluster_idx' + ]] = self.start_cluster_model.train_df[[ + 'start_cluster_idx' + ]] + + # create trip-level clusters + trip_cluster_idx = self.trip_grouper.fit_transform(self.train_df) + self.train_df.loc[:, 'trip_cluster_idx'] = trip_cluster_idx + + return self + + def predict_proba(self, test_df): + """ NOTE: these class probabilities do NOT have a + confidence-discounting heuristic applied. + """ + self.end_cluster_model.predict(test_df) + # store a copy of test_df for now (TODO: make this more efficient since + # the data is duplicated) + self.test_df = self.end_cluster_model.test_df + + if self.cluster_method in ['trip', 'combination']: + self.start_cluster_model.predict(test_df) + # append the start cluster indices + self.test_df.loc[:, [ + 'start_cluster_idx' + ]] = self.start_cluster_model.test_df.loc[:, ['start_cluster_idx']] + + # create trip-level clusters + trip_cluster_idx = self.trip_grouper.transform(self.test_df) + self.test_df.loc[:, 'trip_cluster_idx'] = trip_cluster_idx + + # extrapolate label distributions from cluster information + self.test_df.loc[:, [ + 'mode_distrib', 'purpose_distrib', 'replaced_distrib' + ]] = np.nan + + if self.cluster_method in ['end', 'trip']: + cluster_col = f'{self.cluster_method}_cluster_idx' + self.test_df = self._add_label_distributions( + self.test_df, cluster_col) + + else: # self.cluster_method == 'combination' + # try to get label distributions from trip-level clusters first, + # because trip-level clusters tend to be more homogenous and will + # yield more accurate predictions + self.test_df = self._add_label_distributions( + self.test_df, 'trip_cluster_idx') + + # for trips that have an empty label-distribution after the first + # pass using trip clusters, try to get a distribution from the + # destination cluster (this includes both trips that *don't* fall + # into a trip cluster, as well as trips that *do* fall into a trip + # cluster but are missing some/all categories of labels due to + # missing user inputs.) 
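The per-cluster distribution that _add_label_distributions() (defined further down) extracts is just a normalized value count over the labeled trips in the cluster; a toy sketch with hypothetical data:

```python
import pandas as pd

# hypothetical labeled trips that share one end cluster
labeled_trips_in_cluster = pd.DataFrame(
    {'purpose_true': ['home', 'home', 'work', None]})

distrib = (labeled_trips_in_cluster['purpose_true']
           .value_counts(normalize=True, dropna=True)  # ignore missing labels
           .to_dict())
# {'home': 0.666..., 'work': 0.333...}
```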
+ + # fill in missing label-distributions by the label_type + # (we want to iterate by label_type rather than check cluster idx + # because it's possible that some trips in a trip-cluster have + # predictions for one label_type but not another) + for label_type in ['mode', 'purpose', 'replaced']: + self.test_df.loc[self.test_df[f'{label_type}_distrib'] == + {}] = self._add_label_distributions( + self.test_df.loc[ + self.test_df[f'{label_type}_distrib'] + == {}], + 'end_cluster_idx', + label_types=[label_type]) + + # create the dataframe of probabilities + proba_dfs = [] + for label_type in ['purpose', 'mode', 'replaced']: + classes = self.train_df[f'{label_type}_true'].dropna().unique() + proba = pd.DataFrame( + self.test_df[f'{label_type}_distrib'].to_list(), + columns=classes) + proba['top_pred'] = proba.idxmax(axis=1) + proba['top_proba'] = proba.max(axis=1, skipna=True) + proba['clusterable'] = self.test_df.end_cluster_idx >= 0 + proba.loc[:, classes] = proba.loc[:, classes].fillna(0) + proba = pd.concat([proba], keys=[label_type], axis=1) + proba_dfs += [proba] + + self.proba_df = pd.concat(proba_dfs, axis=1) + return self.proba_df + + def _add_label_distributions(self, + df, + cluster_col, + label_types=['mode', 'purpose', 'replaced']): + """ Add label distributions to a DataFrame. + + Args: + df (DataFrame): DataFrame containing a column of clusters + cluster_col (str): name of column in df containing clusters + label_types (str list): the categories of labels to retrieve + distributions for. + + Returns: + a DataFrame with additional columns in which the entries are + dictionaries containing label distributions. + """ + df = df.copy() # to avoid SettingWithCopyWarning + for c in df.loc[:, cluster_col].unique(): + labeled_trips_in_cluster = self.train_df.loc[ + self.train_df[cluster_col] == c] + unlabeled_trips_in_cluster = df.loc[df[cluster_col] == c] + + cluster_size = len(unlabeled_trips_in_cluster) + + for label_type in label_types: + assert label_type in ['mode', 'purpose', 'replaced'] + + # get distribution of label_type labels in this cluster + distrib = labeled_trips_in_cluster[ + f'{label_type}_true'].value_counts(normalize=True, + dropna=True).to_dict() + # TODO: add confidence discounting + + # update predictions + # convert the dict into a list of dicts to work around pandas + # thinking we're trying to insert information according to a + # key-value map + # TODO: this is the line throwing the set on slice warning + df.loc[df[cluster_col] == c, + f'{label_type}_distrib'] = [distrib] * cluster_size + + return df + + +class EnsembleClassifier(TripClassifier, metaclass=ABCMeta): + """ Template class for trip classifiers using ensemble algorithms. 
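As a sketch of what the template expects, a hypothetical minimal subclass (assuming it sits in this module next to OneHotWrapper; ForestClassifier further down is the real, fully-featured version) only has to supply the location-feature choice, the encoders, and the three predictors:

```python
class MinimalCoordClassifier(EnsembleClassifier):
    """Toy subclass that uses raw coordinates as the location feature."""

    def __init__(self, n_estimators=50):
        self.loc_feature = 'coordinates'
        self.radius = 100  # used by _clusterable() to flag familiar end points
        self.purpose_enc = OneHotWrapper(impute_missing=True, sparse=False,
                                         handle_unknown='error')
        self.mode_enc = OneHotWrapper(impute_missing=True, sparse=False,
                                      handle_unknown='error')
        self.purpose_predictor = RandomForestClassifier(n_estimators=n_estimators)
        self.mode_predictor = RandomForestClassifier(n_estimators=n_estimators)
        self.replaced_predictor = RandomForestClassifier(n_estimators=n_estimators)

    def set_params(self, params):
        for key, value in params.items():
            setattr(self, key, value)
        return self
```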
+ + Required args: + loc_feature (str): 'coordinates' or 'cluster' + """ + base_features = [ + 'duration', + 'distance', + 'start_local_dt_year', + 'start_local_dt_month', + 'start_local_dt_day', + 'start_local_dt_hour', + # 'start_local_dt_minute', + 'start_local_dt_weekday', + 'end_local_dt_year', # most likely the same as the start year + 'end_local_dt_month', # most likely the same as the start month + 'end_local_dt_day', + 'end_local_dt_hour', + # 'end_local_dt_minute', + 'end_local_dt_weekday', + ] + targets = ['mode_true', 'purpose_true', 'replaced_true'] + + # required instance attributes + loc_feature = NotImplemented + purpose_enc = NotImplemented + mode_enc = NotImplemented + purpose_predictor = NotImplemented + mode_predictor = NotImplemented + replaced_predictor = NotImplemented + + # required methods + def fit(self, train_df): + # get location features + if self.loc_feature == 'cluster': + # fit clustering model(s) and one-hot encode their indices + # TODO: consolidate start/end_cluster_model in a single instance + # that has a location_type parameter in the fit() method + self.end_cluster_model.fit(train_df) + + clusters_to_encode = self.end_cluster_model.train_df[[ + 'end_cluster_idx' + ]].copy() # copy is to avoid SettingWithCopyWarning + + if self.use_start_clusters or self.use_trip_clusters: + self.start_cluster_model.fit(train_df) + + if self.use_start_clusters: + clusters_to_encode = pd.concat([ + clusters_to_encode, self.start_cluster_model.train_df[[ + 'start_cluster_idx' + ]] + ], + axis=1) + if self.use_trip_clusters: + start_end_clusters = pd.concat([ + self.end_cluster_model.train_df[['end_cluster_idx']], + self.start_cluster_model.train_df[[ + 'start_cluster_idx' + ]] + ], + axis=1) + trip_cluster_idx = self.trip_grouper.fit_transform( + start_end_clusters) + clusters_to_encode.loc[:, + 'trip_cluster_idx'] = trip_cluster_idx + + loc_features_df = self.cluster_enc.fit_transform( + clusters_to_encode.astype(int)) + + # clean the df again because we need it in the next step + # TODO: remove redundancy + self.train_df = self._clean_data(train_df) + + # TODO: move below code into a reusable function + if self.train_df.purpose_true.isna().any(): + num_nan = self.train_df.purpose_true.value_counts( + dropna=False).loc[np.nan] + logging.info( + f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' + ) + self.train_df = self.train_df.dropna( + subset=['purpose_true']).reset_index(drop=True) + if len(self.train_df) == 0: + # i.e. no valid trips after removing all nans + raise Exception('no valid trips; nothing to fit') + + else: # self.loc_feature == 'coordinates' + self.train_df = self._clean_data(train_df) + + # TODO: move below code into a reusable function + if self.train_df.purpose_true.isna().any(): + num_nan = self.train_df.purpose_true.value_counts( + dropna=False).loc[np.nan] + logging.info( + f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' + ) + self.train_df = self.train_df.dropna( + subset=['purpose_true']).reset_index(drop=True) + if len(self.train_df) == 0: + # i.e. 
no valid trips after removing all nans + raise Exception('no valid trips; nothing to fit') + + loc_features_df = self.train_df[[ + 'start_lon', 'start_lat', 'end_lon', 'end_lat' + ]] + + # prepare data for the ensemble classifiers + + # note that we want to use purpose data to aid our mode predictions, + # and use both purpose and mode data to aid our replaced-mode + # predictions + # thus, we want to one-hot encode the purpose and mode as data + # features, but also preserve an unencoded copy for the target columns + + # dataframe holding all features and targets + self.Xy_train = pd.concat([ + self.train_df[self.base_features + self.targets], loc_features_df + ], + axis=1) + + # encode purposes and modes + onehot_purpose_df = self.purpose_enc.fit_transform( + self.Xy_train[['purpose_true']], output_col_prefix='purpose') + onehot_mode_df = self.mode_enc.fit_transform( + self.Xy_train[['mode_true']], output_col_prefix='mode') + self.Xy_train = pd.concat( + [self.Xy_train, onehot_purpose_df, onehot_mode_df], axis=1) + + # for predicting purpose, drop encoded purpose and mode features, as + # well as all target labels + self.X_purpose = self.Xy_train.dropna(subset=['purpose_true']).drop( + labels=self.targets + self.purpose_enc.onehot_encoding_cols + + self.mode_enc.onehot_encoding_cols, + axis=1) + + # for predicting mode, we want to keep purpose data + self.X_mode = self.Xy_train.dropna(subset=['mode_true']).drop( + labels=self.targets + self.mode_enc.onehot_encoding_cols, axis=1) + + # for predicting replaced-mode, we want to keep purpose and mode data + self.X_replaced = self.Xy_train.dropna(subset=['replaced_true']).drop( + labels=self.targets, axis=1) + + self.y_purpose = self.Xy_train['purpose_true'].dropna() + self.y_mode = self.Xy_train['mode_true'].dropna() + self.y_replaced = self.Xy_train['replaced_true'].dropna() + + # fit classifiers + if len(self.X_purpose) > 0: + self.purpose_predictor.fit(self.X_purpose, self.y_purpose) + if len(self.X_mode) > 0: + self.mode_predictor.fit(self.X_mode, self.y_mode) + if len(self.X_replaced) > 0: + self.replaced_predictor.fit(self.X_replaced, self.y_replaced) + + return self + + def predict_proba(self, test_df): + """ NOTE: these class probabilities do NOT have a + confidence-discounting heuristic applied. 
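The chaining performed below, where the predicted purpose is one-hot encoded and fed to the mode model (and predicted mode is then added for the replaced-mode model), can be sketched standalone with toy features, using pd.get_dummies in place of the module's OneHotWrapper:

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

X_train = pd.DataFrame({'duration': [300, 1200, 600], 'distance': [1000, 8000, 2500]})
y_purpose = pd.Series(['home', 'work', 'home'])
y_mode = pd.Series(['walk', 'car', 'bike'])

purpose_clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X_train, y_purpose)

# the (true) purposes are one-hot encoded and appended as features for the mode model
X_train_mode = pd.concat([X_train, pd.get_dummies(y_purpose, prefix='purpose')], axis=1)
mode_clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X_train_mode, y_mode)

# at prediction time the *predicted* purpose is encoded the same way
X_test = pd.DataFrame({'duration': [500], 'distance': [1800]})
purpose_pred = pd.Series(purpose_clf.predict(X_test))
X_test_mode = pd.concat([X_test, pd.get_dummies(purpose_pred, prefix='purpose')], axis=1)
# align columns with training; one-hot columns unseen at test time are filled with 0
X_test_mode = X_test_mode.reindex(columns=X_train_mode.columns, fill_value=0)
mode_proba = mode_clf.predict_proba(X_test_mode)
```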
+ """ + ################ + ### get data ### + ################ + self.X_test_for_purpose = self._get_X_test_for_purpose(test_df) + + ######################## + ### make predictions ### + ######################## + # note that we want to use purpose data to aid our mode predictions, + # and use both purpose and mode data to aid our replaced-mode + # predictions + + # TODO: some of the code across the try and except blocks can be + # consolidated by considering one-hot encoding fully np.nan arrays + try: + purpose_proba_raw = self.purpose_predictor.predict_proba( + self.X_test_for_purpose) + purpose_proba = pd.DataFrame( + purpose_proba_raw, columns=self.purpose_predictor.classes_) + purpose_pred = purpose_proba.idxmax(axis=1) + + # update X_test with one-hot-encoded purpose predictions to aid + # mode predictor + # TODO: converting purpose_pred to a DataFrame feels super + # unnecessary, make this more efficient + onehot_purpose_df = self.purpose_enc.transform( + pd.DataFrame(purpose_pred).set_index( + self.X_test_for_purpose.index)) + self.X_test_for_mode = pd.concat( + [self.X_test_for_purpose, onehot_purpose_df], axis=1) + + mode_proba, replaced_proba = self._try_predict_proba_mode_replaced( + ) + + except NotFittedError as e: + # if we can't predict purpose, we can still try to predict mode and + # replaced-mode without one-hot encoding the purpose + + purpose_pred = np.full((len(self.X_test_for_purpose), ), np.nan) + purpose_proba_raw = np.full((len(self.X_test_for_purpose), 1), 0) + purpose_proba = pd.DataFrame(purpose_proba_raw, columns=[np.nan]) + + self.X_test_for_mode = self.X_test_for_purpose + mode_proba, replaced_proba = self._try_predict_proba_mode_replaced( + ) + + mode_pred = mode_proba.idxmax(axis=1) + replaced_pred = replaced_proba.idxmax(axis=1) + + if (purpose_pred.dtype == np.float64 and mode_pred.dtype == np.float64 + and replaced_pred.dtype == np.float64): + # this indicates that all the predictions are np.nan so none of the + # random forest classifiers were fitted + raise NotFittedError + + # TODO: move this to a Mixin for cluster-based predictors and use the + # 'cluster' column of the proba_df outputs + # if self.drop_unclustered: + # # TODO: actually, we should only drop purpose predictions. we can + # # then impute the missing entries in the purpose feature and still + # # try to predict mode and replaced-mode without it + # self.predictions.loc[ + # self.end_cluster_model.test_df['end_cluster_idx'] == -1, + # ['purpose_pred', 'mode_pred', 'replaced_pred']] = np.nan + + proba_dfs = [] + for label_type, proba in zip( + ['purpose', 'mode', 'replaced'], + [purpose_proba, mode_proba, replaced_proba]): + proba['top_pred'] = proba.idxmax(axis=1) + proba['top_proba'] = proba.max(axis=1, skipna=True) + proba['clusterable'] = self._clusterable( + self.X_test_for_purpose).astype(bool) + proba = pd.concat([proba], keys=[label_type], axis=1) + proba_dfs += [proba] + + self.proba_df = pd.concat(proba_dfs, axis=1) + return self.proba_df + + def _get_X_test_for_purpose(self, test_df): + """ Do the pre-processing to get data that we can then pass into the + ensemble classifiers. 
+ """ + if self.loc_feature == 'cluster': + # get clusters + self.end_cluster_model.predict(test_df) + clusters_to_encode = self.end_cluster_model.test_df[[ + 'end_cluster_idx' + ]].copy() # copy is to avoid SettingWithCopyWarning + + if self.use_start_clusters or self.use_trip_clusters: + self.start_cluster_model.predict(test_df) + + if self.use_start_clusters: + clusters_to_encode = pd.concat([ + clusters_to_encode, + self.start_cluster_model.test_df[['start_cluster_idx']] + ], + axis=1) + if self.use_trip_clusters: + start_end_clusters = pd.concat([ + self.end_cluster_model.test_df[['end_cluster_idx']], + self.start_cluster_model.test_df[['start_cluster_idx']] + ], + axis=1) + trip_cluster_idx = self.trip_grouper.transform( + start_end_clusters) + clusters_to_encode.loc[:, + 'trip_cluster_idx'] = trip_cluster_idx + + # one-hot encode the cluster indices + loc_features_df = self.cluster_enc.transform(clusters_to_encode) + else: # self.loc_feature == 'coordinates' + test_df = self._clean_data(test_df) + loc_features_df = test_df[[ + 'start_lon', 'start_lat', 'end_lon', 'end_lat' + ]] + + # extract the desired data + X_test = pd.concat([ + test_df[self.base_features].reset_index(drop=True), + loc_features_df.reset_index(drop=True) + ], + axis=1) + + return X_test + + def _try_predict_proba_mode_replaced(self): + """ Try to predict mode and replaced-mode. Handles error in case the + ensemble algorithms were not fitted. + + Requires self.X_test_for_mode to have already been set. (These are + the DataFrames containing the test data to be passed into self. + mode_predictor.) + + Returns: mode_proba and replaced_proba, two DataFrames containing + class probabilities for mode and replaced-mode respectively + """ + + try: + # predict mode + mode_proba_raw = self.mode_predictor.predict_proba( + self.X_test_for_mode) + mode_proba = pd.DataFrame(mode_proba_raw, + columns=self.mode_predictor.classes_) + mode_pred = mode_proba.idxmax(axis=1) + + # update X_test with one-hot-encoded mode predictions to aid + # replaced-mode predictor + onehot_mode_df = self.mode_enc.transform( + pd.DataFrame(mode_pred).set_index(self.X_test_for_mode.index)) + self.X_test_for_replaced = pd.concat( + [self.X_test_for_mode, onehot_mode_df], axis=1) + replaced_proba = self._try_predict_proba_replaced() + + except NotFittedError as e: + mode_proba_raw = np.full((len(self.X_test_for_mode), 1), 0) + mode_proba = pd.DataFrame(mode_proba_raw, columns=[np.nan]) + + # if we don't have mode predictions, we *could* still try to + # predict replaced mode (but if the user didn't input mode labels + # then it's unlikely they would input replaced-mode) + self.X_test_for_replaced = self.X_test_for_mode + replaced_proba = self._try_predict_proba_replaced() + + return mode_proba, replaced_proba + + def _try_predict_proba_replaced(self): + """ Try to predict replaced mode. Handles error in case the + replaced_predictor was not fitted. + + Requires self.X_test_for_replaced to have already been set. (This + is the DataFrame containing the test data to be passed into self. + replaced_predictor.) 
+ + Returns: replaced_proba, DataFrame containing class probabilities + for replaced-mode + """ + try: + replaced_proba_raw = self.replaced_predictor.predict_proba( + self.X_test_for_replaced + ) # has shape (len_trips, number of replaced_mode classes) + replaced_proba = pd.DataFrame( + replaced_proba_raw, columns=self.replaced_predictor.classes_) + + except NotFittedError as e: + replaced_proba_raw = np.full((len(self.X_test_for_replaced), 1), 0) + replaced_proba = pd.DataFrame(replaced_proba_raw, columns=[np.nan]) + + return replaced_proba + + def _clusterable(self, test_df): + """ Check if the end points can be clustered (i.e. are within + meters of an end point from the training set) + """ + if self.loc_feature == 'cluster': + return self.end_cluster_model.test_df.end_cluster_idx >= 0 + + n_samples = test_df.shape[0] + clustered = np.ones(shape=n_samples, dtype=int) * False + + train_coordinates = self.train_df[['end_lat', 'end_lon']] + train_radians = np.radians(train_coordinates) + + for idx, row in test_df.reset_index(drop=True).iterrows(): + # calculate the distances between the ith test data and all points, + # then find the minimum distance for each point and check if it's + # within the distance threshold. + # unfortunately, pairwise_distances_argmin() does not support + # haversine distance, so we have to reimplement it ourselves + new_loc_radians = np.radians(row[["end_lat", "end_lon"]].to_list()) + new_loc_radians = np.reshape(new_loc_radians, (1, 2)) + dist_matrix_meters = haversine_distances( + new_loc_radians, train_radians) * EARTH_RADIUS + + shortest_dist = np.min(dist_matrix_meters) + if shortest_dist < self.radius: + clustered[idx] = True + + return clustered + + +class ForestClassifier(EnsembleClassifier): + """ Random forest-based trip classifier. + + Args: + loc_feature (str): 'coordinates' or 'cluster'; whether to use lat/ + lon coordinates or cluster indices for the location feature + radius (int): radius for DBSCAN clustering. only if + loc_feature=='cluster' + size_thresh (int): the min number of trips a cluster must have to + be considered for sub-division via SVM. only if + loc_feature=='cluster' + purity_thresh (float): the min purity a cluster must have to be + sub-divided via SVM. only if loc_feature=='cluster' + gamma (float): coefficient for the rbf kernel in SVM. only if + loc_feature=='cluster' + C (float): regularization hyperparameter for SVM. only if + loc_feature=='cluster' + n_estimators (int): number of estimators in the random forest + criterion (str): function to measure the quality of a split in the + random forest + max_depth (int): max depth of a tree in the random forest. + unlimited if None. + min_samples_split (int): min number of samples required to split an + internal node in a decision tree + min_samples_leaf (int): min number of samples required for a leaf + node in a decision tree + max_features (str): number of features to consider when looking for + the best split in a decision tree + bootstrap (bool): whether bootstrap samples are used when building + decision trees + random_state (int): random state for deterministic random forest + construction + use_start_clusters (bool): whether or not to use start clusters as + input features to the ensemble classifier. only if + loc_feature=='cluster' + use_trip_clusters (bool): whether or not to use trip-level clusters + as input features to the ensemble classifier. 
only if + loc_feature=='cluster' + """ + + def __init__( + self, + loc_feature='coordinates', + radius=100, # TODO: add different start and end radii + size_thresh=1, + purity_thresh=1.0, + gamma=0.05, + C=1, + n_estimators=100, + criterion='gini', + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + max_features='sqrt', + bootstrap=True, + random_state=42, + # drop_unclustered=False, + use_start_clusters=False, + use_trip_clusters=True): + assert loc_feature in ['cluster', 'coordinates'] + self.loc_feature = loc_feature + self.radius = radius + self.size_thresh = size_thresh + self.purity_thresh = purity_thresh + self.gamma = gamma + self.C = C + self.n_estimators = n_estimators + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.max_features = max_features + self.bootstrap = bootstrap + self.random_state = random_state + # self.drop_unclustered = drop_unclustered + self.use_start_clusters = use_start_clusters + self.use_trip_clusters = use_trip_clusters + + if self.loc_feature == 'cluster': + # clustering algorithm to generate end clusters + self.end_cluster_model = DBSCANSVMCluster( + loc_type='end', + radius=self.radius, + size_thresh=self.size_thresh, + purity_thresh=self.purity_thresh, + gamma=self.gamma, + C=self.C) + + if self.use_start_clusters or self.use_trip_clusters: + # clustering algorithm to generate start clusters + self.start_cluster_model = DBSCANSVMCluster( + loc_type='start', + radius=self.radius, + size_thresh=self.size_thresh, + purity_thresh=self.purity_thresh, + gamma=self.gamma, + C=self.C) + + if self.use_trip_clusters: + # helper class to generate trip-level clusters + self.trip_grouper = TripGrouper( + start_cluster_col='start_cluster_idx', + end_cluster_col='end_cluster_idx') + + # wrapper class to generate one-hot encodings for cluster indices + self.cluster_enc = OneHotWrapper(sparse=False, + handle_unknown='ignore') + + # wrapper class to generate one-hot encodings for purposes and modes + self.purpose_enc = OneHotWrapper(impute_missing=True, + sparse=False, + handle_unknown='error') + self.mode_enc = OneHotWrapper(impute_missing=True, + sparse=False, + handle_unknown='error') + + # ensemble classifiers for each label category + self.purpose_predictor = RandomForestClassifier( + n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_features=self.max_features, + bootstrap=self.bootstrap, + random_state=self.random_state) + self.mode_predictor = RandomForestClassifier( + n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_features=self.max_features, + bootstrap=self.bootstrap, + random_state=self.random_state) + self.replaced_predictor = RandomForestClassifier( + n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_features=self.max_features, + bootstrap=self.bootstrap, + random_state=self.random_state) + + def set_params(self, params): + """ hacky code that mimics the set_params of an sklearn Estimator class + so that we can pass params during randomizedsearchCV + + Args: + params (dict): a dictionary where the keys are the parameter + names and the values are the parameter 
values + """ + loc_feature = params['loc_feature'] if 'loc_feature' in params.keys( + ) else self.loc_feature + radius = params['radius'] if 'radius' in params.keys() else self.radius + size_thresh = params['size_thresh'] if 'size_thresh' in params.keys( + ) else self.size_thresh + purity_thresh = params[ + 'purity_thresh'] if 'purity_thresh' in params.keys( + ) else self.purity_thresh + gamma = params['gamma'] if 'gamma' in params.keys() else self.gamma + C = params['C'] if 'C' in params.keys() else self.C + n_estimators = params['n_estimators'] if 'n_estimators' in params.keys( + ) else self.n_estimators + criterion = params['criterion'] if 'criterion' in params.keys( + ) else self.criterion + max_depth = params['max_depth'] if 'max_depth' in params.keys( + ) else self.max_depth + min_samples_split = params[ + 'min_samples_split'] if 'min_samples_split' in params.keys( + ) else self.min_samples_split + min_samples_leaf = params[ + 'min_samples_leaf'] if 'min_samples_leaf' in params.keys( + ) else self.min_samples_leaf + max_features = params['max_features'] if 'max_features' in params.keys( + ) else self.max_features + bootstrap = params['bootstrap'] if 'bootstrap' in params.keys( + ) else self.bootstrap + random_state = params['random_state'] if 'random_state' in params.keys( + ) else self.random_state + use_start_clusters = params[ + 'use_start_clusters'] if 'use_start_clusters' in params.keys( + ) else self.use_start_clusters + # drop_unclustered = params[ + # 'drop_unclustered'] if 'drop_unclustered' in params.keys( + # ) else self.drop_unclustered + use_trip_clusters = params[ + 'use_trip_clusters'] if 'use_trip_clusters' in params.keys( + ) else self.use_trip_clusters + + # yes, calling __init__ again is not good practice... + self.__init__(loc_feature, radius, size_thresh, purity_thresh, gamma, + C, n_estimators, criterion, max_depth, min_samples_split, + min_samples_leaf, max_features, bootstrap, random_state, + use_start_clusters, use_trip_clusters) + return self + + +class ClusterForestSlimPredictor(ForestClassifier): + """ This is the same as ForestClassifier, just with fewer base + features. + + Args: + loc_feature (str): 'coordinates' or 'cluster'; whether to use lat/ + lon coordinates or cluster indices for the location feature + radius (int): radius for DBSCAN clustering. only if + loc_feature=='cluster' + size_thresh (int): the min number of trips a cluster must have to + be considered for sub-division via SVM. only if + loc_feature=='cluster' + purity_thresh (float): the min purity a cluster must have to be + sub-divided via SVM. only if loc_feature=='cluster' + gamma (float): coefficient for the rbf kernel in SVM. only if + loc_feature=='cluster' + C (float): regularization hyperparameter for SVM. only if + loc_feature=='cluster' + n_estimators (int): number of estimators in the random forest + criterion (str): function to measure the quality of a split in the + random forest + max_depth (int): max depth of a tree in the random forest. + unlimited if None. 
+ min_samples_split (int): min number of samples required to split an + internal node in a decision tree + min_samples_leaf (int): min number of samples required for a leaf + node in a decision tree + max_features (str): number of features to consider when looking for + the best split in a decision tree + bootstrap (bool): whether bootstrap samples are used when building + decision trees + random_state (int): random state for deterministic random forest + construction + use_start_clusters (bool): whether or not to use start clusters as + input features to the ensemble classifier. only if + loc_feature=='cluster' + use_trip_clusters (bool): whether or not to use trip-level clusters + as input features to the ensemble classifier. only if + loc_feature=='cluster' + """ + + def __init__( + self, + loc_feature='coordinates', + radius=100, # TODO: add different start and end radii + size_thresh=1, + purity_thresh=1.0, + gamma=0.05, + C=1, + n_estimators=100, + criterion='gini', + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + max_features='sqrt', + bootstrap=True, + random_state=42, + # drop_unclustered=False, + use_start_clusters=False, + use_trip_clusters=True): + + super().__init__(loc_feature, radius, size_thresh, purity_thresh, + gamma, C, n_estimators, criterion, max_depth, + min_samples_split, min_samples_leaf, max_features, + bootstrap, random_state, use_start_clusters, + use_trip_clusters) + + self.base_features = [ + 'duration', + 'distance', + ] + + +class AdaBoostClassifier(EnsembleClassifier): + """ AdaBoost-based trip classifier. + + Args: + loc_feature (str): 'coordinates' or 'cluster'; whether to use lat/ + lon coordinates or cluster indices for the location feature + radius (int): radius for DBSCAN clustering. only if + loc_feature=='cluster' + size_thresh (int): the min number of trips a cluster must have to + be considered for sub-division via SVM. only if + loc_feature=='cluster' + purity_thresh (float): the min purity a cluster must have to be + sub-divided via SVM. only if loc_feature=='cluster' + gamma (float): coefficient for the rbf kernel in SVM. only if + loc_feature=='cluster' + C (float): regularization hyperparameter for SVM. only if + loc_feature=='cluster' + n_estimators (int): number of estimators + criterion (str): function to measure the quality of a split in a + decision tree + max_depth (int): max depth of a tree in the random forest. + unlimited if None. + min_samples_split (int): min number of samples required to split an + internal node in a decision tree + min_samples_leaf (int): min number of samples required for a leaf + node in a decision tree + max_features (str): number of features to consider when looking for + the best split in a decision tree + random_state (int): random state for deterministic random forest + construction + use_start_clusters (bool): whether or not to use start clusters as + input features to the ensemble classifier. only if + loc_feature=='cluster' + use_trip_clusters (bool): whether or not to use trip-level clusters + as input features to the ensemble classifier. 
only if + loc_feature=='cluster' + learning_rate (float): weight applied to each decision tree at each + boosting iteration + """ + + def __init__( + self, + loc_feature='coordinates', + radius=100, # TODO: add different start and end radii + size_thresh=1, + purity_thresh=1.0, + gamma=0.05, + C=1, + n_estimators=100, + criterion='gini', + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + max_features='sqrt', + random_state=42, + # drop_unclustered=False, + use_start_clusters=False, + use_trip_clusters=True, + use_base_clusters=True, + learning_rate=1.0): + assert loc_feature in ['cluster', 'coordinates'] + self.loc_feature = loc_feature + self.radius = radius + self.size_thresh = size_thresh + self.purity_thresh = purity_thresh + self.gamma = gamma + self.C = C + self.n_estimators = n_estimators + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.max_features = max_features + self.random_state = random_state + # self.drop_unclustered = drop_unclustered + self.use_start_clusters = use_start_clusters + self.use_trip_clusters = use_trip_clusters + self.use_base_clusters = use_base_clusters + self.learning_rate = learning_rate + + if self.loc_feature == 'cluster': + # clustering algorithm to generate end clusters + self.end_cluster_model = DBSCANSVMCluster( + loc_type='end', + radius=self.radius, + size_thresh=self.size_thresh, + purity_thresh=self.purity_thresh, + gamma=self.gamma, + C=self.C) + + if self.use_start_clusters or self.use_trip_clusters: + # clustering algorithm to generate start clusters + self.start_cluster_model = DBSCANSVMCluster( + loc_type='start', + radius=self.radius, + size_thresh=self.size_thresh, + purity_thresh=self.purity_thresh, + gamma=self.gamma, + C=self.C) + + if self.use_trip_clusters: + # helper class to generate trip-level clusters + self.trip_grouper = TripGrouper( + start_cluster_col='start_cluster_idx', + end_cluster_col='end_cluster_idx') + + # wrapper class to generate one-hot encodings for cluster indices + self.cluster_enc = OneHotWrapper(sparse=False, + handle_unknown='ignore') + + # wrapper class to generate one-hot encodings for purposes and modes + self.purpose_enc = OneHotWrapper(impute_missing=True, + sparse=False, + handle_unknown='error') + self.mode_enc = OneHotWrapper(impute_missing=True, + sparse=False, + handle_unknown='error') + + self.purpose_predictor = AdaBoostClassifier( + n_estimators=self.n_estimators, + learning_rate=self.learning_rate, + random_state=self.random_state, + base_estimator=DecisionTreeClassifier( + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_features=self.max_features, + random_state=self.random_state)) + self.mode_predictor = AdaBoostClassifier( + n_estimators=self.n_estimators, + learning_rate=self.learning_rate, + random_state=self.random_state, + base_estimator=DecisionTreeClassifier( + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_features=self.max_features, + random_state=self.random_state)) + self.replaced_predictor = AdaBoostClassifier( + n_estimators=self.n_estimators, + learning_rate=self.learning_rate, + random_state=self.random_state, + base_estimator=DecisionTreeClassifier( + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + 
min_samples_leaf=self.min_samples_leaf, + max_features=self.max_features, + random_state=self.random_state)) + + def set_params(self, params): + """ hacky code that mimics the set_params of an sklearn Estimator class + so that we can pass params during randomizedsearchCV + + Args: + params (dict): a dictionary where the keys are the parameter + names and the values are the parameter values + """ + radius = params['radius'] if 'radius' in params.keys() else self.radius + size_thresh = params['size_thresh'] if 'size_thresh' in params.keys( + ) else self.size_thresh + purity_thresh = params[ + 'purity_thresh'] if 'purity_thresh' in params.keys( + ) else self.purity_thresh + gamma = params['gamma'] if 'gamma' in params.keys() else self.gamma + C = params['C'] if 'C' in params.keys() else self.C + n_estimators = params['n_estimators'] if 'n_estimators' in params.keys( + ) else self.n_estimators + criterion = params['criterion'] if 'criterion' in params.keys( + ) else self.criterion + max_depth = params['max_depth'] if 'max_depth' in params.keys( + ) else self.max_depth + min_samples_split = params[ + 'min_samples_split'] if 'min_samples_split' in params.keys( + ) else self.min_samples_split + min_samples_leaf = params[ + 'min_samples_leaf'] if 'min_samples_leaf' in params.keys( + ) else self.min_samples_leaf + max_features = params['max_features'] if 'max_features' in params.keys( + ) else self.max_features + random_state = params['random_state'] if 'random_state' in params.keys( + ) else self.random_state + use_start_clusters = params[ + 'use_start_clusters'] if 'use_start_clusters' in params.keys( + ) else self.use_start_clusters + # drop_unclustered = params[ + # 'drop_unclustered'] if 'drop_unclustered' in params.keys( + # ) else self.drop_unclustered + use_trip_clusters = params[ + 'use_trip_clusters'] if 'use_trip_clusters' in params.keys( + ) else self.use_trip_clusters + learning_rate = params[ + 'learning_rate'] if 'learning_rate' in params.keys( + ) else self.learning_rate + + # calling __init__ again is not good practice, I know... + self.__init__(radius, size_thresh, purity_thresh, gamma, C, + n_estimators, criterion, max_depth, min_samples_split, + min_samples_leaf, max_features, random_state, + use_start_clusters, use_trip_clusters, learning_rate) + return self + + +class TripGrouper(): + """ Helper class to get trip clusters from start and end clusters. + + Args: + start_cluster_col (str): name of the column containing start + cluster indices + end_cluster_col (str): name of the column containing end cluster + indices + """ + + def __init__(self, + start_cluster_col='start_cluster_idx', + end_cluster_col='end_cluster_idx'): + self.start_cluster_col = start_cluster_col + self.end_cluster_col = end_cluster_col + + def fit_transform(self, trip_df): + """ Fit and remember possible trip clusters. + + Args: + trip_df (DataFrame): DataFrame containing trips. must have + columns and + """ + trip_groups = trip_df.groupby( + [self.start_cluster_col, self.end_cluster_col]) + + # need dict so we can access the trip indices of all the trips in each + # group. the key is the group tuple and the value is the list of trip + # indices in the group. 
+ self.trip_groups_dict = dict(trip_groups.groups) + + # we want to convert trip-group tuples to to trip-cluster indices, + # hence the pd Series + trip_groups_series = pd.Series(list(self.trip_groups_dict.keys())) + + trip_cluster_idx = np.empty(len(trip_df)) + + for group_idx in range(len(trip_groups_series)): + group_tuple = trip_groups_series[group_idx] + trip_idxs_in_group = self.trip_groups_dict[group_tuple] + trip_cluster_idx[trip_idxs_in_group] = group_idx + + return trip_cluster_idx + + def transform(self, new_trip_df): + """ Get trip clusters for a new set of trips. + + Args: + new_trip_df (DataFrame): DataFrame containing trips. must have + columns and + """ + prediction_trip_groups = new_trip_df.groupby( + [self.start_cluster_col, self.end_cluster_col]) + + # need dict so we can access the trip indices of all the trips in each + # group. the key is the group tuple and the value is the list of trip + # indices in the group. + prediction_trip_groups_dict = dict(prediction_trip_groups.groups) + trip_groups_series = pd.Series(list(self.trip_groups_dict.keys())) + trip_cluster_idx = np.empty(len(new_trip_df)) + + for group_tuple in dict(prediction_trip_groups.groups).keys(): + # check if the trip cluster exists in the training set + trip_idxs_in_group = prediction_trip_groups_dict[group_tuple] + if group_tuple in self.trip_groups_dict.keys(): + # look up the group index from the series we created when we + # fit the model + group_idx = trip_groups_series[trip_groups_series == + group_tuple].index[0] + else: + group_idx = -1 + + trip_cluster_idx[trip_idxs_in_group] = group_idx + + return trip_cluster_idx + + +class OneHotWrapper(): + """ Helper class to streamline one-hot encoding. + + Args: + impute_missing (bool): whether or not to impute np.nan values. + sparse (bool): whether or not to return a sparse matrix. + handle_unknown (str): specifies the way unknown categories are + handled during transform. + """ + + def __init__( + self, + impute_missing=False, + sparse=False, + handle_unknown='ignore', + ): + self.impute_missing = impute_missing + if self.impute_missing: + self.encoder = make_pipeline( + SimpleImputer(missing_values=np.nan, + strategy='constant', + fill_value='missing'), + OneHotEncoder(sparse=False, handle_unknown=handle_unknown)) + else: + self.encoder = OneHotEncoder(sparse=sparse, + handle_unknown=handle_unknown) + + def fit_transform(self, train_df, output_col_prefix=None): + """ Establish one-hot encoded variables. + + Args: + train_df (DataFrame): DataFrame containing train trips. 
+ output_col_prefix (str): only if train_df is a single column + """ + # TODO: handle pd series + + train_df = train_df.copy() # to avoid SettingWithCopyWarning + + # if imputing, the dtype of each column must be string/object and not + # numerical, otherwise the SimpleImputer will fail + if self.impute_missing: + for col in train_df.columns: + train_df[col] = train_df[col].astype(object) + onehot_encoding = self.encoder.fit_transform(train_df) + self.onehot_encoding_cols_all = [] + for col in train_df.columns: + if train_df.shape[1] > 1 or output_col_prefix is None: + output_col_prefix = col + self.onehot_encoding_cols_all += [ + f'{output_col_prefix}_{val}' + for val in np.sort(train_df[col].dropna().unique()) + ] + # we handle np.nan separately because it is of type float, and may + # cause issues with np.sort if the rest of the unique values are + # strings + if any((train_df[col].isna())): + self.onehot_encoding_cols_all += [f'{output_col_prefix}_nan'] + + onehot_encoding_df = pd.DataFrame( + onehot_encoding, + columns=self.onehot_encoding_cols_all).set_index(train_df.index) + + # ignore the encoded columns for missing entries + self.onehot_encoding_cols = copy.deepcopy( + self.onehot_encoding_cols_all) + for col in self.onehot_encoding_cols_all: + if col.endswith('_nan'): + onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) + self.onehot_encoding_cols.remove(col) + + return onehot_encoding_df.astype(int) + + def transform(self, test_df): + """ One-hot encoded features in accordance with features seen in the + train set. + + Args: + test_df (DataFrame): DataFrame of trips. + """ + # TODO: rename test_df, this one doesn't necessarily need to be a df + onehot_encoding = self.encoder.transform(test_df) + onehot_encoding_df = pd.DataFrame( + onehot_encoding, + columns=self.onehot_encoding_cols_all).set_index(test_df.index) + + # ignore the encoded columns for missing entries + for col in self.onehot_encoding_cols_all: + if col.endswith('_nan'): + onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) + + return onehot_encoding_df.astype(int) \ No newline at end of file From 9d6a1af0665e05d5527d1a325f966e027a23ee77 Mon Sep 17 00:00:00 2001 From: Hannah Lu Date: Fri, 16 Dec 2022 09:20:30 -0800 Subject: [PATCH 02/28] update user uuid lookup; add documentation note --- TRB_label_assist/models.py | 42 ++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/TRB_label_assist/models.py b/TRB_label_assist/models.py index e5283d730..b370878f3 100644 --- a/TRB_label_assist/models.py +++ b/TRB_label_assist/models.py @@ -24,6 +24,8 @@ from emission.analysis.classification.inference.labels.inferrers import predict_cluster_confidence_discounting import emission.core.wrapper.entry as ecwe import emission.analysis.modelling.tour_model_extended.similarity as eamts +# NOTE: tour_model_extended.similarity is on the +# eval-private-data-compatibility branch in e-mission-server # logging.basicConfig(level=logging.DEBUG) @@ -599,8 +601,7 @@ def _NN_predict(self, test_df): # unfortunately, pairwise_distances_argmin() does not support # haversine distance, so we have to reimplement it ourselves new_loc_radians = np.radians( - row[[self.loc_type + "_lat", - self.loc_type + "_lon"]].to_list()) + row[[self.loc_type + "_lat", self.loc_type + "_lon"]].to_list()) new_loc_radians = np.reshape(new_loc_radians, (1, 2)) dist_matrix_meters = haversine_distances( new_loc_radians, train_radians) * EARTH_RADIUS @@ -657,8 +658,8 @@ def fit(self, train_df): 
bsm.create_user_input_map(train_trips, bins), user_id) # save location features of all bins - bsm.save_models('locations', - bsm.create_location_map(train_trips, bins), user_id) + bsm.save_models('locations', bsm.create_location_map(train_trips, bins), + user_id) return self def predict_proba(self, test_df): @@ -1053,17 +1054,14 @@ def fit(self, train_df): if self.use_start_clusters: clusters_to_encode = pd.concat([ - clusters_to_encode, self.start_cluster_model.train_df[[ - 'start_cluster_idx' - ]] + clusters_to_encode, + self.start_cluster_model.train_df[['start_cluster_idx']] ], axis=1) if self.use_trip_clusters: start_end_clusters = pd.concat([ self.end_cluster_model.train_df[['end_cluster_idx']], - self.start_cluster_model.train_df[[ - 'start_cluster_idx' - ]] + self.start_cluster_model.train_df[['start_cluster_idx']] ], axis=1) trip_cluster_idx = self.trip_grouper.fit_transform( @@ -1120,10 +1118,9 @@ def fit(self, train_df): # features, but also preserve an unencoded copy for the target columns # dataframe holding all features and targets - self.Xy_train = pd.concat([ - self.train_df[self.base_features + self.targets], loc_features_df - ], - axis=1) + self.Xy_train = pd.concat( + [self.train_df[self.base_features + self.targets], loc_features_df], + axis=1) # encode purposes and modes onehot_purpose_df = self.purpose_enc.fit_transform( @@ -1197,8 +1194,7 @@ def predict_proba(self, test_df): self.X_test_for_mode = pd.concat( [self.X_test_for_purpose, onehot_purpose_df], axis=1) - mode_proba, replaced_proba = self._try_predict_proba_mode_replaced( - ) + mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() except NotFittedError as e: # if we can't predict purpose, we can still try to predict mode and @@ -1209,8 +1205,7 @@ def predict_proba(self, test_df): purpose_proba = pd.DataFrame(purpose_proba_raw, columns=[np.nan]) self.X_test_for_mode = self.X_test_for_purpose - mode_proba, replaced_proba = self._try_predict_proba_mode_replaced( - ) + mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() mode_pred = mode_proba.idxmax(axis=1) replaced_pred = replaced_proba.idxmax(axis=1) @@ -1580,8 +1575,8 @@ def set_params(self, params): ) else self.use_trip_clusters # yes, calling __init__ again is not good practice... 
- self.__init__(loc_feature, radius, size_thresh, purity_thresh, gamma, - C, n_estimators, criterion, max_depth, min_samples_split, + self.__init__(loc_feature, radius, size_thresh, purity_thresh, gamma, C, + n_estimators, criterion, max_depth, min_samples_split, min_samples_leaf, max_features, bootstrap, random_state, use_start_clusters, use_trip_clusters) return self @@ -1648,8 +1643,8 @@ def __init__( use_start_clusters=False, use_trip_clusters=True): - super().__init__(loc_feature, radius, size_thresh, purity_thresh, - gamma, C, n_estimators, criterion, max_depth, + super().__init__(loc_feature, radius, size_thresh, purity_thresh, gamma, + C, n_estimators, criterion, max_depth, min_samples_split, min_samples_leaf, max_features, bootstrap, random_state, use_start_clusters, use_trip_clusters) @@ -2005,8 +2000,7 @@ def fit_transform(self, train_df, output_col_prefix=None): columns=self.onehot_encoding_cols_all).set_index(train_df.index) # ignore the encoded columns for missing entries - self.onehot_encoding_cols = copy.deepcopy( - self.onehot_encoding_cols_all) + self.onehot_encoding_cols = copy.deepcopy(self.onehot_encoding_cols_all) for col in self.onehot_encoding_cols_all: if col.endswith('_nan'): onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) From 9bd9b18b8561d4bd3740da1ed26b9acf741f1593 Mon Sep 17 00:00:00 2001 From: Shankari Date: Tue, 14 Feb 2023 22:10:09 -0800 Subject: [PATCH 03/28] Add additional logging to the calculation so that we can monitor the result generation more carefully --- TRB_label_assist/models.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/TRB_label_assist/models.py b/TRB_label_assist/models.py index b370878f3..6f02277ce 100644 --- a/TRB_label_assist/models.py +++ b/TRB_label_assist/models.py @@ -283,6 +283,7 @@ class RefactoredNaiveCluster(Cluster): """ def __init__(self, loc_type='end', radius=100): + logging.info("PERF: Initializing RefactoredNaiveCluster") self.loc_type = loc_type self.radius = radius @@ -294,6 +295,7 @@ def set_params(self, params): def fit(self, train_df): # clean data + logging.info("PERF: Fitting RefactoredNaiveCluster with size %s" % len(train_df)) self.train_df = self._clean_data(train_df) # we can use all trips as long as they have purpose labels. it's ok if @@ -328,6 +330,7 @@ def fit(self, train_df): return self def predict(self, test_df): + logging.info("PERF: Predicting RefactoredNaiveCluster for %s" % len(test_df)) self.test_df = self._clean_data(test_df) if self.loc_type == 'start': @@ -339,6 +342,8 @@ def predict(self, test_df): # for each trip in the test list: for idx, row in self.test_df.iterrows(): + if idx % 100 == 0: + logging.info("PERF: RefactoredNaiveCluster Working on trip %s/%s" % (idx, len(self.test_df))) # iterate over all bins trip_binned = False for i, bin in enumerate(bins): @@ -418,6 +423,7 @@ def __init__(self, purity_thresh=1.0, gamma=0.05, C=1): + logging.info("PERF: Initializing DBSCANSVMCluster") self.loc_type = loc_type self.radius = radius self.svm = svm @@ -453,6 +459,7 @@ def fit(self, train_df): ################## ### clean data ### ################## + logging.info("PERF: Fitting DBSCANSVMCluster") self.train_df = self._clean_data(train_df) # we can use all trips as long as they have purpose labels. 
it's ok if @@ -563,6 +570,7 @@ def fit_predict(self, train_df): return self.train_df[[f'{self.loc_type}_cluster_idx']] def predict(self, test_df): + logging.info("PERF: Predicting DBSCANSVMCluster") # TODO: store clusters as polygons so the prediction is faster # TODO: we probably don't want to store test_df in self to be more memory-efficient self.test_df = self._clean_data(test_df) @@ -579,6 +587,7 @@ def _NN_predict(self, test_df): sklearn doesn't implement predict() for DBSCAN, which is why we need a custom method. """ + logging.info("PERF: NN_predicting DBSCANSVMCluster") n_samples = test_df.shape[0] labels = np.ones(shape=n_samples, dtype=int) * -1 @@ -631,6 +640,7 @@ class NaiveBinningClassifier(TripClassifier): """ def __init__(self, radius=500): + logging.info("PERF: Initializing NaiveBinningClassifier") self.radius = radius def set_params(self, params): @@ -639,6 +649,7 @@ def set_params(self, params): return self def fit(self, train_df): + logging.info("PERF: Fitting NaiveBinningClassifier") # (copied from bsm.build_user_model()) # convert train_df to a list because the existing binning algorithm @@ -668,6 +679,7 @@ def predict_proba(self, test_df): """ # convert test_df to a list because the existing binning algorithm # only accepts lists of Entry objects + logging.info("PERF: Predicting NaiveBinningClassifier") test_trips = self._trip_df_to_list(test_df) purpose_distribs = [] @@ -2026,4 +2038,4 @@ def transform(self, test_df): if col.endswith('_nan'): onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) - return onehot_encoding_df.astype(int) \ No newline at end of file + return onehot_encoding_df.astype(int) From 1b9ece091e20a8fe7b94939ecd56821c4812c1c2 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Tue, 22 Aug 2023 16:11:51 -0400 Subject: [PATCH 04/28] making `cluster_performance.ipynb`, `generate_figs_for_poster` and `SVM_decision_boundaries` compatible with changes in `clustering.py` and `mapping.py` files. Also porting these 3 notebooks to trip_model `cluster_performance.ipynb`, `generate_figs_for_poster` and `SVM_decision_boundaries` now have no dependence on the custom branch. Results of plots are attached to show no difference in theie previous and current outputs. --- TRB_label_assist/models.py | 43 +++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/TRB_label_assist/models.py b/TRB_label_assist/models.py index 6f02277ce..54cd3101f 100644 --- a/TRB_label_assist/models.py +++ b/TRB_label_assist/models.py @@ -23,7 +23,8 @@ import emission.analysis.modelling.tour_model_first_only.evaluation_pipeline as ep from emission.analysis.classification.inference.labels.inferrers import predict_cluster_confidence_discounting import emission.core.wrapper.entry as ecwe -import emission.analysis.modelling.tour_model_extended.similarity as eamts +import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg +import clustering # NOTE: tour_model_extended.similarity is on the # eval-private-data-compatibility branch in e-mission-server @@ -293,7 +294,7 @@ def set_params(self, params): return self - def fit(self, train_df): + def fit(self, train_df,ct_entry): # clean data logging.info("PERF: Fitting RefactoredNaiveCluster with size %s" % len(train_df)) self.train_df = self._clean_data(train_df) @@ -315,17 +316,23 @@ def fit(self, train_df): if len(self.train_df) == 0: # i.e. 
no valid trips after removing all nans raise Exception('no valid trips; nothing to fit') - + + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": self.radius, # meters, + "apply_cutoff": False, + "clustering_way":'origin' if self.loc_type=='start' + else 'destination' if self.loc_type =='end' + else 'origin-destination', + "incremental_evaluation": False + } + # fit the bins - self.sim_model = eamts.Similarity(self.train_df, - radius_start=self.radius, - radius_end=self.radius, - shouldFilter=False, - cutoff=False) - # we only bin the loc_type points to speed up the alg. avoid - # unnecessary binning since this is really slow - self.sim_model.bin_helper(loc_type=self.loc_type) - labels = self.sim_model.data_df[self.loc_type + '_bin'].to_list() + self.sim_model= eamtg.GreedySimilarityBinning(model_config) + cleaned_trip_entry= clustering.cleanEntryTypeData(self.train_df,ct_entry) + self.sim_model.fit(cleaned_trip_entry) + + labels = [int(l) for l in self.sim_model.tripLabels] self.train_df.loc[:, f'{self.loc_type}_cluster_idx'] = labels return self @@ -880,13 +887,19 @@ def set_params(self, params): return self - def fit(self, train_df): + def fit(self, train_df,ct_entry): # fit clustering model - self.end_cluster_model.fit(train_df) + if self.__class__.__name__ == 'RefactoredNaiveCluster': + self.end_cluster_model.fit(train_df,ct_entry) + else: + self.end_cluster_model.fit(train_df) self.train_df = self.end_cluster_model.train_df if self.cluster_method in ['trip', 'combination']: - self.start_cluster_model.fit(train_df) + if self.__class__.__name__ == 'RefactoredNaiveCluster': + self.start_cluster_model.fit(train_df,ct_entry) + else: + self.start_cluster_model.fit(train_df) self.train_df.loc[:, ['start_cluster_idx' ]] = self.start_cluster_model.train_df[[ 'start_cluster_idx' From e7d2a14d9172293d894ffac894eaee8df8adb49c Mon Sep 17 00:00:00 2001 From: $aTyam Date: Sat, 26 Aug 2023 02:46:09 -0400 Subject: [PATCH 05/28] Unified Interface for fit function Unified Interface for fit function across all models. Passing 'Entry' Type data from the notebooks till the Binning functions. Default set to 'none'. --- TRB_label_assist/models.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/TRB_label_assist/models.py b/TRB_label_assist/models.py index 54cd3101f..5bca3b31f 100644 --- a/TRB_label_assist/models.py +++ b/TRB_label_assist/models.py @@ -117,11 +117,12 @@ class Cluster(SetupMixin, metaclass=ABCMeta): """ blueprint for clustering models. """ @abstractmethod - def fit(self, train_df): + def fit(self, train_df,ct_entry=None): """ Fit the clustering algorithm. Args: train_df (DataFrame): dataframe of labeled trips + ct_entry (List) : A list of Entry type of labeled and unlabeled trips Returns: self @@ -160,11 +161,12 @@ def fit_predict(self, train_df): class TripClassifier(SetupMixin, metaclass=ABCMeta): @abstractmethod - def fit(self, train_df): + def fit(self, train_df,ct_entry=None): """ Fit a classification model. 
Args: train_df (DataFrame): dataframe of labeled trips + ct_entry (List) : A list of Entry type of labeled and unlabeled trips Returns: self @@ -294,7 +296,7 @@ def set_params(self, params): return self - def fit(self, train_df,ct_entry): + def fit(self, train_df,ct_entry=None): # clean data logging.info("PERF: Fitting RefactoredNaiveCluster with size %s" % len(train_df)) self.train_df = self._clean_data(train_df) @@ -451,7 +453,7 @@ def set_params(self, params): return self - def fit(self, train_df): + def fit(self, train_df,ct_entry=None): """ Creates clusters of trip points. self.train_df will be updated with columns containing base and final clusters. @@ -462,6 +464,7 @@ def fit(self, train_df): Args: train_df (dataframe): dataframe of labeled trips + ct_entry (List) : A list of Entry type of labeled and unlabeled trips """ ################## ### clean data ### @@ -655,7 +658,7 @@ def set_params(self, params): return self - def fit(self, train_df): + def fit(self, train_df,ct_entry=None): logging.info("PERF: Fitting NaiveBinningClassifier") # (copied from bsm.build_user_model()) @@ -887,19 +890,13 @@ def set_params(self, params): return self - def fit(self, train_df,ct_entry): + def fit(self, train_df,ct_entry=None): # fit clustering model - if self.__class__.__name__ == 'RefactoredNaiveCluster': - self.end_cluster_model.fit(train_df,ct_entry) - else: - self.end_cluster_model.fit(train_df) + self.end_cluster_model.fit(train_df,ct_entry) self.train_df = self.end_cluster_model.train_df if self.cluster_method in ['trip', 'combination']: - if self.__class__.__name__ == 'RefactoredNaiveCluster': - self.start_cluster_model.fit(train_df,ct_entry) - else: - self.start_cluster_model.fit(train_df) + self.start_cluster_model.fit(train_df,ct_entry) self.train_df.loc[:, ['start_cluster_idx' ]] = self.start_cluster_model.train_df[[ 'start_cluster_idx' @@ -1062,7 +1059,7 @@ class EnsembleClassifier(TripClassifier, metaclass=ABCMeta): replaced_predictor = NotImplemented # required methods - def fit(self, train_df): + def fit(self, train_df,ct_entry=None): # get location features if self.loc_feature == 'cluster': # fit clustering model(s) and one-hot encode their indices From 59633e04e99ebb7e95b25c11fd1cb1c40b321137 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Wed, 30 Aug 2023 15:32:29 -0400 Subject: [PATCH 06/28] Fixing `models.py` to support `regenerate_classification_performance_results.py` Prior to this update, `NaiveBinningClassifier` in 'models.py' had dependencies on both of tour model and trip model. Now, this classifier is completely dependent on trip model. All the other notebooks (except `classification_performance.ipynb`) were tested as well and they are working as usual. Other minor fixes to support previous changes. 
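Reviewer note: the core of this refactor is easier to follow outside diff syntax. Below is a condensed sketch of the new trip-model-only fit flow that the diff implements; the module aliases, config keys, and save call are taken directly from the patch, while train_trips, user_id, and the radius default are placeholders supplied by the caller, and the sketch assumes the e-mission-server trip_model modules are importable.

# Condensed sketch of the refactored NaiveBinningClassifier.fit() flow
# (mirrors the diff below; not a drop-in replacement for it).
import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg
import emission.analysis.modelling.trip_model.model_storage as eamums
import emission.analysis.modelling.trip_model.model_type as eamumt
import emission.analysis.modelling.trip_model.run_model as eamur

def fit_naive_binning(train_trips, user_id, radius=500):
    # configuration keys as introduced in the patch
    model_config = {
        "metric": "od_similarity",
        "similarity_threshold_meters": radius,
        "apply_cutoff": False,
        "clustering_way": "origin-destination",
        "incremental_evaluation": False,
    }
    # bin trips by origin-destination similarity using the trip model only
    sim_model = eamtg.GreedySimilarityBinning(model_config)
    sim_model.fit(train_trips)

    # persist the fitted bins so the label-assist pipeline can reuse them
    last_done_ts = eamur._latest_timestamp(train_trips)
    eamums.save_model(user_id,
                      eamumt.ModelType.GREEDY_SIMILARITY_BINNING,
                      sim_model.to_dict(),
                      last_done_ts,
                      eamums.ModelStorage.DOCUMENT_DATABASE)
    return sim_model.bins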
--- TRB_label_assist/models.py | 40 +++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/TRB_label_assist/models.py b/TRB_label_assist/models.py index 5bca3b31f..1ba9339db 100644 --- a/TRB_label_assist/models.py +++ b/TRB_label_assist/models.py @@ -24,6 +24,12 @@ from emission.analysis.classification.inference.labels.inferrers import predict_cluster_confidence_discounting import emission.core.wrapper.entry as ecwe import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg +import emission.analysis.modelling.tour_model.similarity as eamts +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.trip_model.run_model as eamur + + import clustering # NOTE: tour_model_extended.similarity is on the # eval-private-data-compatibility branch in e-mission-server @@ -343,9 +349,9 @@ def predict(self, test_df): self.test_df = self._clean_data(test_df) if self.loc_type == 'start': - bins = self.sim_model.start_bins + bins = self.sim_model.bins elif self.loc_type == 'end': - bins = self.sim_model.end_bins + bins = self.sim_model.bins labels = [] @@ -376,7 +382,7 @@ def _match(self, trip, bin, loc_type): copied from the Similarity class on the e-mission-server. """ for t_idx in bin: - trip_in_bin = self.train_df.iloc[t_idx] + trip_in_bin = self.train_df.iloc[int(t_idx)] if not self._distance_helper(trip, trip_in_bin, loc_type): return False return True @@ -666,21 +672,29 @@ def fit(self, train_df,ct_entry=None): # only accepts lists of Entry objects train_trips = self._trip_df_to_list(train_df) - sim, bins, bin_trips, train_trips = ep.first_round( - train_trips, self.radius) - + + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": self.radius, # meters, + "apply_cutoff": False, + "clustering_way": "origin-destination", #cause thats what is set in performance_eval.py for this model + "incremental_evaluation": False + } + + sim_model = eamtg.GreedySimilarityBinning(model_config) + sim_model.fit(train_trips) # set instance variables so we can access results later as well - self.sim = sim - self.bins = bins + self.sim = sim_model + self.bins = sim_model.bins # save all user labels user_id = train_df.user_id.iloc[0] - bsm.save_models('user_labels', - bsm.create_user_input_map(train_trips, bins), user_id) + model_type=eamumt.ModelType.GREEDY_SIMILARITY_BINNING + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE + model_data_next=sim_model.to_dict() + last_done_ts = eamur._latest_timestamp(train_trips) + eamums.save_model(user_id, model_type, model_data_next, last_done_ts, model_storage) - # save location features of all bins - bsm.save_models('locations', bsm.create_location_map(train_trips, bins), - user_id) return self def predict_proba(self, test_df): From 0adb5fe4256592934a06303fa020b12ec4d738ac Mon Sep 17 00:00:00 2001 From: $aTyam Date: Thu, 14 Sep 2023 15:24:10 -0400 Subject: [PATCH 07/28] [PARTIALLY TESTED] Single database read and Code Cleanuo 1. removed mentions of `tour_model` or `tour_model_first_only` . 2. removed two reads from database. 3. 
Removed notebook outputs ( this could be the reason a few diffs are too big to view) --- TRB_label_assist/models.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/TRB_label_assist/models.py b/TRB_label_assist/models.py index 1ba9339db..475f0b6d0 100644 --- a/TRB_label_assist/models.py +++ b/TRB_label_assist/models.py @@ -19,12 +19,10 @@ from clustering import get_distance_matrix, single_cluster_purity import data_wrangling import emission.storage.decorations.trip_queries as esdtq -import emission.analysis.modelling.tour_model_first_only.build_save_model as bsm -import emission.analysis.modelling.tour_model_first_only.evaluation_pipeline as ep from emission.analysis.classification.inference.labels.inferrers import predict_cluster_confidence_discounting import emission.core.wrapper.entry as ecwe import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg -import emission.analysis.modelling.tour_model.similarity as eamts +import emission.core.common as ecc import emission.analysis.modelling.trip_model.model_storage as eamums import emission.analysis.modelling.trip_model.model_type as eamumt import emission.analysis.modelling.trip_model.run_model as eamur @@ -398,8 +396,8 @@ def _distance_helper(self, tripa, tripb, loc_type): ptb_lat = tripb[[loc_type + '_lat']] ptb_lon = tripb[[loc_type + '_lon']] - return eamts.within_radius(pta_lat, pta_lon, ptb_lat, ptb_lon, - self.radius) + dist= ecc.calDistance([pta_lon,pta_lat],[ptb_lon,ptb_lat]) + return dist <= self.radius class DBSCANSVMCluster(Cluster): From e9abd5173255a3d68539d44e73bdf4b30674fea8 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Mon, 2 Oct 2023 09:35:56 -0400 Subject: [PATCH 08/28] [PARTIALLY TESTED] Survey Assist Using RF RF initialisation and fit function. Build test written and tested. fit of Random forest uses df . 
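Reviewer note: to make the intended design easier to review, here is a small, self-contained sketch of the chained random-forest approach that the new ForestClassifier implements: one RandomForestClassifier per label category, with the one-hot encoded purpose labels appended to the feature set of the mode model. The hyperparameter names and values follow the "forest" block added to trip_model.conf.json.sample below; the toy dataframe and helper names are illustrative only and are not part of the patch.

# Illustrative sketch only -- not the ForestClassifier added in this patch.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# toy labeled-trip dataframe (made-up values, for illustration)
trips = pd.DataFrame({
    "distance":     [1200.0,  350.0, 15000.0,  900.0],
    "duration":     [ 600.0,  240.0,  1800.0,  420.0],
    "start_lon":    [-105.10, -105.10, -105.20, -105.10],
    "start_lat":    [  39.70,   39.70,   39.80,   39.70],
    "end_lon":      [-105.20, -105.10, -105.00, -105.20],
    "end_lat":      [  39.80,   39.70,   39.90,   39.80],
    "purpose_true": ["work", "shopping", "work", "exercise"],
    "mode_true":    ["bike", "walk", "drove_alone", "walk"],
})

# hyperparameters mirror the sample config added in this patch
rf_params = dict(n_estimators=100, criterion="gini", max_depth=None,
                 min_samples_split=2, min_samples_leaf=1, max_features="sqrt",
                 bootstrap=True, random_state=42)
base_features = ["distance", "duration",
                 "start_lon", "start_lat", "end_lon", "end_lat"]

# 1) purpose predictor: trained on trip features only
purpose_rf = RandomForestClassifier(**rf_params)
purpose_rf.fit(trips[base_features], trips["purpose_true"])

# 2) mode predictor: trip features plus one-hot encoded purpose
purpose_onehot = pd.get_dummies(trips["purpose_true"], prefix="purpose")
X_mode = pd.concat([trips[base_features], purpose_onehot], axis=1)
mode_rf = RandomForestClassifier(**rf_params)
mode_rf.fit(X_mode, trips["mode_true"])

print(purpose_rf.predict(trips[base_features]))
print(mode_rf.predict(X_mode))

The actual ForestClassifier in the diff additionally trains a replaced-mode forest on top of the purpose and mode features, and can swap the raw coordinates for DBSCAN+SVM cluster indices when loc_feature is set to 'cluster'.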
--- conf/analysis/trip_model.conf.json.sample | 20 +- .../modelling/trip_model/dbscan_svm.py | 250 +++++++++++++ .../modelling/trip_model/forest_classifier.py | 335 ++++++++++++++++++ .../trip_model/greedy_similarity_binning.py | 3 +- .../modelling/trip_model/model_type.py | 5 +- .../modelling/trip_model/run_model.py | 9 +- .../analysis/modelling/trip_model/util.py | 139 +++++++- .../modellingTests/TestRunForestModel.py | 200 +++++++++++ 8 files changed, 950 insertions(+), 11 deletions(-) create mode 100644 emission/analysis/modelling/trip_model/dbscan_svm.py create mode 100644 emission/analysis/modelling/trip_model/forest_classifier.py create mode 100644 emission/tests/modellingTests/TestRunForestModel.py diff --git a/conf/analysis/trip_model.conf.json.sample b/conf/analysis/trip_model.conf.json.sample index 845e67a6a..4851be5d6 100644 --- a/conf/analysis/trip_model.conf.json.sample +++ b/conf/analysis/trip_model.conf.json.sample @@ -1,5 +1,5 @@ { - "model_type": "greedy", + "model_type": "forest", "model_storage": "document_database", "minimum_trips": 14, "model_parameters": { @@ -8,6 +8,24 @@ "similarity_threshold_meters": 500, "apply_cutoff": false, "incremental_evaluation": false + }, + "forest": { + "loc_feature" : "coordinates", + "radius": 100, + "size_thresh":1, + "purity_thresh":1.0, + "gamma":0.05, + "C":1, + "n_estimators":100, + "criterion":"gini", + "max_depth":null, + "min_samples_split":2, + "min_samples_leaf":1, + "max_features":"sqrt", + "bootstrap":true, + "random_state":42, + "use_start_clusters":false, + "use_trip_clusters":true } } } \ No newline at end of file diff --git a/emission/analysis/modelling/trip_model/dbscan_svm.py b/emission/analysis/modelling/trip_model/dbscan_svm.py new file mode 100644 index 000000000..58cd8f7e0 --- /dev/null +++ b/emission/analysis/modelling/trip_model/dbscan_svm.py @@ -0,0 +1,250 @@ +import emission.analysis.modelling.trip_model.trip_model as eamuu +from sklearn.cluster import DBSCAN +import logging +import numpy as np +import pandas as pd +import emission.analysis.modelling.trip_model.util as eamtu +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import make_pipeline +from sklearn import svm +from sklearn.metrics.pairwise import haversine_distances + +EARTH_RADIUS = 6371000 + +class DBSCANSVMCluster(eamuu.TripModel): + """ DBSCAN-based clustering algorithm that optionally implements SVM + sub-clustering. + + Args: + loc_type (str): 'start' or 'end', the type of point to cluster + radius (int): max distance between two points in each other's + neighborhood, i.e. DBSCAN's eps value. 
does not strictly + dictate final cluster size + size_thresh (int): the min number of trips a cluster must have + to be considered for SVM sub-division + purity_thresh (float): the min purity a cluster must have + to be sub-divided using SVM + gamma (float): coefficient for the rbf kernel in SVM + C (float): regularization hyperparameter for SVM + + Attributes: + loc_type (str) + radius (int) + size_thresh (int) + purity_thresh (float) + gamma (float) + C (float) + train_df (DataFrame) + test_df (DataFrame) + base_model (sklearn Estimator) + """ + + def __init__(self, + loc_type='end', + radius=100, + svm=True, + size_thresh=1, + purity_thresh=1.0, + gamma=0.05, + C=1): + logging.info("PERF: Initializing DBSCANSVMCluster") + self.loc_type = loc_type + self.radius = radius + self.svm = svm + self.size_thresh = size_thresh + self.purity_thresh = purity_thresh + self.gamma = gamma + self.C = C + + def set_params(self, params): + if 'loc_type' in params.keys(): self.loc_type = params['loc_type'] + if 'radius' in params.keys(): self.radius = params['radius'] + if 'svm' in params.keys(): self.svm = params['svm'] + if 'size_thresh' in params.keys(): + self.size_thresh = params['size_thresh'] + if 'purity_thresh' in params.keys(): + self.purity_thresh = params['purity_thresh'] + if 'gamma' in params.keys(): self.gamma = params['gamma'] + + return self + + def fit(self, train_df,ct_entry=None): + """ Creates clusters of trip points. + self.train_df will be updated with columns containing base and + final clusters. + + TODO: perhaps move the loc_type argument to fit() so we can use a + single class instance to cluster both start and end points. This + will also help us reduce duplicate data. + + Args: + train_df (dataframe): dataframe of labeled trips + ct_entry (List) : A list of Entry type of labeled and unlabeled trips + """ + ################## + ### clean data ### + ################## + logging.info("PERF: Fitting DBSCANSVMCluster") + self.train_df = self._clean_data(train_df) + + # we can use all trips as long as they have purpose labels. it's ok if + # they're missing mode/replaced-mode labels, because they aren't as + # strongly correlated with location compared to purpose + # TODO: actually, we may want to rethink this. for example, it will + # probably be helpful to include trips that are missing purpose labels + # but still have mode labels. + if self.train_df.purpose_true.isna().any(): + num_nan = self.train_df.purpose_true.value_counts( + dropna=False).loc[np.nan] + logging.info( + f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' + ) + self.train_df = self.train_df.dropna( + subset=['purpose_true']).reset_index(drop=True) + if len(self.train_df) == 0: + # i.e. 
no valid trips after removing all nans + raise Exception('no valid trips; nothing to fit') + + ######################### + ### get base clusters ### + ######################### + dist_matrix_meters = eamtu.get_distance_matrix(self.train_df, self.loc_type) + self.base_model = DBSCAN(self.radius, + metric="precomputed", + min_samples=1).fit(dist_matrix_meters) + base_clusters = self.base_model.labels_ + + self.train_df.loc[:, + f'{self.loc_type}_base_cluster_idx'] = base_clusters + + ######################## + ### get sub-clusters ### + ######################## + # copy base cluster column into final cluster column + self.train_df.loc[:, f'{self.loc_type}_cluster_idx'] = self.train_df[ + f'{self.loc_type}_base_cluster_idx'] + + if self.svm: + c = 0 # count of how many clusters we have iterated over + + # iterate over all clusters and subdivide them with SVM. the while + # loop is so we can do multiple iterations of subdividing if needed + while c < self.train_df[f'{self.loc_type}_cluster_idx'].max(): + points_in_cluster = self.train_df[ + self.train_df[f'{self.loc_type}_cluster_idx'] == c] + + # only do SVM if we have the minimum num of trips in the cluster + if len(points_in_cluster) < self.size_thresh: + c += 1 + continue + + # only do SVM if purity is below threshold + purity = eamtu.single_cluster_purity(points_in_cluster, + label_col='purpose_true') + if purity < self.purity_thresh: + X = points_in_cluster[[ + f"{self.loc_type}_lon", f"{self.loc_type}_lat" + ]] + y = points_in_cluster.purpose_true.to_list() + + svm_model = make_pipeline( + StandardScaler(), + svm.SVC( + kernel='rbf', + gamma=self.gamma, + C=self.C, + )).fit(X, y) + labels = svm_model.predict(X) + unique_labels = np.unique(labels) + + # if the SVM predicts that all points in the cluster have + # the same label, just ignore it and don't reindex. + # this also helps us to handle the possibility that a + # cluster may be impure but inherently inseparable, e.g. an + # end cluster at a user's home, containing 50% trips from + # work to home and 50% round trips that start and end at + # home. we don't want to reindex otherwise the low purity + # will trigger SVM again, and we will attempt & fail to + # split the cluster ad infinitum + if len(unique_labels) > 1: + # map purpose labels to new cluster indices + # we offset indices by the max existing index so that we + # don't run into any duplicate indices + max_existing_idx = self.train_df[ + f'{self.loc_type}_cluster_idx'].max() + label_to_cluster = { + unique_labels[i]: i + max_existing_idx + 1 + for i in range(len(unique_labels)) + } + # update trips with their new cluster indices + indices = np.array( + [label_to_cluster[l] for l in labels]) + self.train_df.loc[ + self.train_df[f'{self.loc_type}_cluster_idx'] == c, + f'{self.loc_type}_cluster_idx'] = indices + + c += 1 + # TODO: make things categorical at the end? or maybe at the start of the decision tree pipeline + + return self + + def fit_predict(self, train_df): + """ Override to avoid unnecessarily computation of distance matrices. 
+ """ + self.fit(train_df) + return self.train_df[[f'{self.loc_type}_cluster_idx']] + + def predict(self, test_df): + logging.info("PERF: Predicting DBSCANSVMCluster") + # TODO: store clusters as polygons so the prediction is faster + # TODO: we probably don't want to store test_df in self to be more memory-efficient + self.test_df = self._clean_data(test_df) + pred_clusters = self._NN_predict(self.test_df) + + self.test_df.loc[:, f'{self.loc_type}_cluster_idx'] = pred_clusters + + return self.test_df[[f'{self.loc_type}_cluster_idx']] + + def _NN_predict(self, test_df): + """ Generate base-cluster predictions for the test data using a + nearest-neighbor approach. + + sklearn doesn't implement predict() for DBSCAN, which is why we + need a custom method. + """ + logging.info("PERF: NN_predicting DBSCANSVMCluster") + n_samples = test_df.shape[0] + labels = np.ones(shape=n_samples, dtype=int) * -1 + + # get coordinates of core points (we can't use model.components_ + # because our input feature was a distance matrix and doesn't contain + # info about the raw coordinates) + # NOTE: technically, every single point in a cluster is a core point + # because it has at least minPts (2) points, including itself, in its + # radius + train_coordinates = self.train_df[[ + f'{self.loc_type}_lat', f'{self.loc_type}_lon' + ]] + train_radians = np.radians(train_coordinates) + + for idx, row in test_df.reset_index(drop=True).iterrows(): + # calculate the distances between the ith test data and all points, + # then find the index of the closest point. if the ith test data is + # within epsilon of the point, then assign its cluster to the ith + # test data (otherwise, leave it as -1, indicating noise). + # unfortunately, pairwise_distances_argmin() does not support + # haversine distance, so we have to reimplement it ourselves + new_loc_radians = np.radians( + row[[self.loc_type + "_lat", self.loc_type + "_lon"]].to_list()) + new_loc_radians = np.reshape(new_loc_radians, (1, 2)) + dist_matrix_meters = haversine_distances( + new_loc_radians, train_radians) * EARTH_RADIUS + + shortest_dist_idx = np.argmin(dist_matrix_meters) + if dist_matrix_meters[0, shortest_dist_idx] < self.radius: + labels[idx] = self.train_df.reset_index( + drop=True).loc[shortest_dist_idx, + f'{self.loc_type}_cluster_idx'] + + return labels + diff --git a/emission/analysis/modelling/trip_model/forest_classifier.py b/emission/analysis/modelling/trip_model/forest_classifier.py new file mode 100644 index 000000000..0816717db --- /dev/null +++ b/emission/analysis/modelling/trip_model/forest_classifier.py @@ -0,0 +1,335 @@ +import pandas as pd +from sklearn.preprocessing import OneHotEncoder +import joblib +from typing import Dict, List, Optional, Tuple +import emission.core.wrapper.confirmedtrip as ecwc +import logging +import numpy as np + + +import emission.analysis.modelling.trip_model.trip_model as eamuu +import emission.analysis.modelling.trip_model.dbscan_svm as eamtd +import emission.analysis.modelling.trip_model.util as eamtu +import emission.analysis.modelling.trip_model.config as eamtc + +from sklearn.ensemble import RandomForestClassifier +class ForestClassifier(eamuu.TripModel): + + def __init__(self,config=None): + + # expected_keys = [ + # 'metric', + # 'similarity_threshold_meters', + # 'apply_cutoff', + # 'incremental_evaluation' + # ] + # for k in expected_keys: + # if config.get(k) is None: + # msg = f"greedy trip model config missing expected key {k}" + # raise KeyError(msg) + if config is None: + config = 
eamtc.get_config_value_or_raise('model_parameters.forest') + logging.debug(f'ForestClassifier loaded model config from file') + else: + logging.debug(f'ForestClassifier using model config argument') + + self.loc_feature = config['loc_feature'] + self.radius = config['radius'] + self.size_thresh = config['size_thresh'] + self.purity_thresh = config['purity_thresh'] + self.gamma = config['gamma'] + self.C = config['C'] + self.n_estimators = config['n_estimators'] + self.criterion =config['criterion'] + self.max_depth = config['max_depth'] if config['max_depth'] != 'null' else None + self.min_samples_split = config['min_samples_split'] + self.min_samples_leaf = config['min_samples_leaf'] + self.max_features = config['max_features'] + self.bootstrap = config['bootstrap'] + self.random_state = config['random_state'] + # self.drop_unclustered = drop_unclustered + self.use_start_clusters = config['use_start_clusters'] + self.use_trip_clusters = config['use_trip_clusters'] + + if self.loc_feature == 'cluster': + # clustering algorithm to generate end clusters + self.end_cluster_model = eamtd.DBSCANSVMCluster( + loc_type='end', + radius=self.radius, + size_thresh=self.size_thresh, + purity_thresh=self.purity_thresh, + gamma=self.gamma, + C=self.C) + + if self.use_start_clusters or self.use_trip_clusters: + # clustering algorithm to generate start clusters + self.start_cluster_model = eamtd.DBSCANSVMCluster( + loc_type='start', + radius=self.radius, + size_thresh=self.size_thresh, + purity_thresh=self.purity_thresh, + gamma=self.gamma, + C=self.C) + + if self.use_trip_clusters: + # helper class to generate trip-level clusters + self.trip_grouper = eamtd.TripGrouper( + start_cluster_col='start_cluster_idx', + end_cluster_col='end_cluster_idx') + + # wrapper class to generate one-hot encodings for cluster indices + self.cluster_enc = eamtu.OneHotWrapper(sparse=False, + handle_unknown='ignore') + + # wrapper class to generate one-hot encodings for purposes and modes + self.purpose_enc = eamtu.OneHotWrapper(impute_missing=True, + sparse=False, + handle_unknown='error') + self.mode_enc = eamtu.OneHotWrapper(impute_missing=True, + sparse=False, + handle_unknown='error') + + # ensemble classifiers for each label category + self.purpose_predictor = RandomForestClassifier( + n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_features=self.max_features, + bootstrap=self.bootstrap, + random_state=self.random_state) + self.mode_predictor = RandomForestClassifier( + n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_features=self.max_features, + bootstrap=self.bootstrap, + random_state=self.random_state) + self.replaced_predictor = RandomForestClassifier( + n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_features=self.max_features, + bootstrap=self.bootstrap, + random_state=self.random_state) + + + def fit(self,data: List[ecwc.Confirmedtrip],data_df=None): + # get location features + if self.loc_feature == 'cluster': + # fit clustering model(s) and one-hot encode their indices + # TODO: consolidate start/end_cluster_model in a single instance + # that has a location_type parameter in the fit() method + 
self.end_cluster_model.fit(data_df) + + clusters_to_encode = self.end_cluster_model.train_df[[ + 'end_cluster_idx' + ]].copy() # copy is to avoid SettingWithCopyWarning + + if self.use_start_clusters or self.use_trip_clusters: + self.start_cluster_model.fit(data_df) + + if self.use_start_clusters: + clusters_to_encode = pd.concat([ + clusters_to_encode, + self.start_cluster_model.train_df[['start_cluster_idx']] + ], + axis=1) + if self.use_trip_clusters: + start_end_clusters = pd.concat([ + self.end_cluster_model.train_df[['end_cluster_idx']], + self.start_cluster_model.train_df[['start_cluster_idx']] + ], + axis=1) + trip_cluster_idx = self.trip_grouper.fit_transform( + start_end_clusters) + clusters_to_encode.loc[:, + 'trip_cluster_idx'] = trip_cluster_idx + + loc_features_df = self.cluster_enc.fit_transform( + clusters_to_encode.astype(int)) + + # clean the df again because we need it in the next step + # TODO: remove redundancy + self.train_df = self._clean_data(data_df) + + # TODO: move below code into a reusable function + if self.train_df.purpose_true.isna().any(): + num_nan = self.train_df.purpose_true.value_counts( + dropna=False).loc[np.nan] + logging.info( + f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' + ) + self.train_df = self.train_df.dropna( + subset=['purpose_true']).reset_index(drop=True) + if len(self.train_df) == 0: + # i.e. no valid trips after removing all nans + raise Exception('no valid trips; nothing to fit') + + else: # self.loc_feature == 'coordinates' + self.train_df = self._clean_data(data_df) + + # TODO: move below code into a reusable function + if self.train_df.purpose_true.isna().any(): + num_nan = self.train_df.purpose_true.value_counts( + dropna=False).loc[np.nan] + logging.info( + f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' + ) + self.train_df = self.train_df.dropna( + subset=['purpose_true']).reset_index(drop=True) + if len(self.train_df) == 0: + # i.e. 
no valid trips after removing all nans + raise Exception('no valid trips; nothing to fit') + + loc_features_df = self.train_df[[ + 'start_lon', 'start_lat', 'end_lon', 'end_lat' + ]] + + # prepare data for the ensemble classifiers + + # note that we want to use purpose data to aid our mode predictions, + # and use both purpose and mode data to aid our replaced-mode + # predictions + # thus, we want to one-hot encode the purpose and mode as data + # features, but also preserve an unencoded copy for the target columns + + # dataframe holding all features and targets + self.Xy_train = pd.concat( + [self.train_df[self.base_features + self.targets], loc_features_df], + axis=1) + + # encode purposes and modes + onehot_purpose_df = self.purpose_enc.fit_transform( + self.Xy_train[['purpose_true']], output_col_prefix='purpose') + onehot_mode_df = self.mode_enc.fit_transform( + self.Xy_train[['mode_true']], output_col_prefix='mode') + self.Xy_train = pd.concat( + [self.Xy_train, onehot_purpose_df, onehot_mode_df], axis=1) + + # for predicting purpose, drop encoded purpose and mode features, as + # well as all target labels + self.X_purpose = self.Xy_train.dropna(subset=['purpose_true']).drop( + labels=self.targets + self.purpose_enc.onehot_encoding_cols + + self.mode_enc.onehot_encoding_cols, + axis=1) + + # for predicting mode, we want to keep purpose data + self.X_mode = self.Xy_train.dropna(subset=['mode_true']).drop( + labels=self.targets + self.mode_enc.onehot_encoding_cols, axis=1) + + # for predicting replaced-mode, we want to keep purpose and mode data + self.X_replaced = self.Xy_train.dropna(subset=['replaced_true']).drop( + labels=self.targets, axis=1) + + self.y_purpose = self.Xy_train['purpose_true'].dropna() + self.y_mode = self.Xy_train['mode_true'].dropna() + self.y_replaced = self.Xy_train['replaced_true'].dropna() + + # fit classifiers + if len(self.X_purpose) > 0: + self.purpose_predictor.fit(self.X_purpose, self.y_purpose) + if len(self.X_mode) > 0: + self.mode_predictor.fit(self.X_mode, self.y_mode) + if len(self.X_replaced) > 0: + self.replaced_predictor.fit(self.X_replaced, self.y_replaced) + + def predict(self, data: List[float]) -> Tuple[List[Dict], int]: + pass + + def to_dict(self) -> Dict: + return joblib.dump(self,compress=3) + + def from_dict(self, model: Dict): + pass + + def is_incremental(self) -> bool: + pass + + def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: + pass + + def _clean_data(self, df): + """ Clean a dataframe of trips. + (Drop trips with missing start/end locations, expand the user input + columns, ensure all essential columns are present) + + Args: + df: a dataframe of trips. 
must contain the columns 'start_loc', + 'end_loc', and should also contain the user input columns + ('mode_confirm', 'purpose_confirm', 'replaced_mode') if + available + """ + assert 'start_loc' in df.columns and 'end_loc' in df.columns + + # clean up the dataframe by dropping entries with NaN locations and + # reset index + num_nan = 0 + if df.start_loc.isna().any(): + num_nan += df.start_loc.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['start_loc']) + if df.end_loc.isna().any(): + num_nan += df.end_loc.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['end_loc']) + + # expand the 'start_loc' and 'end_loc' column into 'start_lat', + # 'start_lon', 'end_lat', and 'end_lon' columns + df = self.expand_coords(df) + + # drop trips with missing coordinates + if df.start_lat.isna().any(): + num_nan += df.start_lat.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['start_lat']) + if df.start_lon.isna().any(): + num_nan += df.start_lon.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['start_lon']) + if df.end_lat.isna().any(): + num_nan += df.end_lat.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['end_lat']) + if df.end_lon.isna().any(): + num_nan = df.end_lon.value_counts(dropna=False).loc[np.nan] + df += df.dropna(subset=['end_lon']) + if num_nan > 0: + logging.info( + f'dropped {num_nan} trips that are missing location coordinates' + ) + + df = df.rename( + columns={ + 'mode_confirm': 'mode_true', + 'purpose_confirm': 'purpose_true', + 'replaced_mode': 'replaced_true' + }) + + for category in ['mode_true', 'purpose_true', 'replaced_true']: + if category not in df.columns: + # for example, if a user labels all their trip modes but none of their trip purposes + df.loc[:, category] = np.nan + + return df.reset_index(drop=True) + + def expand_coords(exp_df, purpose=None): + """ + copied and modifed from get_loc_df_for_purpose() in the 'Radius + selection' notebook + """ + purpose_trips = exp_df + if purpose is not None: + purpose_trips = exp_df[exp_df.purpose_confirm == purpose] + + dfs = [purpose_trips] + for loc_type in ['start', 'end']: + df = pd.DataFrame( + purpose_trips[loc_type + + "_loc"].apply(lambda p: p["coordinates"]).to_list(), + columns=[loc_type + "_lon", loc_type + "_lat"]) + df = df.set_index(purpose_trips.index) + dfs.append(df) + + # display.display(end_loc_df.head()) + return pd.concat(dfs, axis=1) \ No newline at end of file diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 226fdefb5..4e9ee6fad 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -128,11 +128,12 @@ class label to apply: self.bins: Dict[str, Dict] = {} - def fit(self, trips: List[ecwc.Confirmedtrip]): + def fit(self, trips: List[ecwc.Confirmedtrip],tripsdf=None): """train the model by passing data, where each row in the data corresponds to a label at the matching index of the label input :param trips: 2D array of features to train from + :param tripsdf: trips data in dataframe format """ logging.debug(f'fit called with {len(trips)} trips') diff --git a/emission/analysis/modelling/trip_model/model_type.py b/emission/analysis/modelling/trip_model/model_type.py index b5e761fb0..2d7e6f743 100644 --- a/emission/analysis/modelling/trip_model/model_type.py +++ b/emission/analysis/modelling/trip_model/model_type.py @@ -3,6 +3,7 @@ 
import emission.analysis.modelling.trip_model.trip_model as eamuu import emission.analysis.modelling.similarity.od_similarity as eamso import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamug +import emission.analysis.modelling.trip_model.forest_classifier as eamuf SIMILARITY_THRESHOLD_METERS=500 @@ -11,6 +12,7 @@ class ModelType(Enum): # ENUM_NAME_CAPS = 'SHORTHAND_NAME_CAPS' GREEDY_SIMILARITY_BINNING = 'GREEDY' + RANDOM_FOREST_CLASSIFIER = 'FOREST' def build(self, config=None) -> eamuu.TripModel: """ @@ -25,7 +27,8 @@ def build(self, config=None) -> eamuu.TripModel: """ # Dict[ModelType, TripModel] MODELS = { - ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning(config) + #ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning(config), + ModelType.RANDOM_FOREST_CLASSIFIER: eamuf.ForestClassifier(config) } model = MODELS.get(self) if model is None: diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py index e3e2b1c4e..45b524f9a 100644 --- a/emission/analysis/modelling/trip_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -56,8 +56,10 @@ def update_trip_model( logging.debug(f'model type {model_type.name} is incremental? {model.is_incremental}') logging.debug(f'time query for training data collection: {time_query}') + ts = esta.TimeSeries.get_time_series(user_id) trips = _get_training_data(user_id, time_query) - + + trips_df = ts.to_data_df("analysis/confirmed_trip",trips) # don't start training for a user that doesn't have at least $trips many trips # (assume if a stored model exists for the user, that they met this requirement previously) if len(trips) == 0: @@ -73,8 +75,9 @@ def update_trip_model( epq.mark_trip_model_failed(user_id) else: - # train and store the model - model.fit(trips) + # train and store the model. pass both List of event and dataframe time data + # that both standard( which mostly work on df) and self implemented models can use. 
+ model.fit(trips,trips_df) model_data_next = model.to_dict() if len(model_data_next) == 0: diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py index 7d22b5d22..0728fb702 100644 --- a/emission/analysis/modelling/trip_model/util.py +++ b/emission/analysis/modelling/trip_model/util.py @@ -1,7 +1,13 @@ from typing import List, Tuple from past.utils import old_div -import numpy +import numpy as np +import pandas as pd from numpy.linalg import norm +import copy + +from sklearn.preprocessing import OneHotEncoder +from sklearn.pipeline import make_pipeline +from sklearn.impute import SimpleImputer def find_knee_point(values: List[float]) -> Tuple[float, int]: @@ -26,16 +32,139 @@ def find_knee_point(values: List[float]) -> Tuple[float, int]: x = list(range(N)) max = 0 index = -1 - a = numpy.array([x[0], values[0]]) - b = numpy.array([x[-1], values[-1]]) + a = np.array([x[0], values[0]]) + b = np.array([x[-1], values[-1]]) n = norm(b - a) new_y = [] for i in range(0, N): - p = numpy.array([x[i], values[i]]) - dist = old_div(norm(numpy.cross(p - a, p - b)), n) + p = np.array([x[i], values[i]]) + dist = old_div(norm(np.cross(p - a, p - b)), n) new_y.append(dist) if dist > max: max = dist index = i value = values[index] return [index, value] + + def get_distance_matrix(loc_df, loc_type): + """ Args: + loc_df (dataframe): must have columns 'start_lat' and 'start_lon' + or 'end_lat' and 'end_lon' + loc_type (str): 'start' or 'end' + """ + assert loc_type == 'start' or loc_type == 'end' + + radians_lat_lon = np.radians(loc_df[[loc_type + "_lat", loc_type + "_lon"]]) + + dist_matrix_meters = pd.DataFrame( + smp.haversine_distances(radians_lat_lon, radians_lat_lon) * + EARTH_RADIUS) + return dist_matrix_meters + +def single_cluster_purity(points_in_cluster, label_col='purpose_confirm'): + """ Calculates purity of a cluster (i.e. % of trips that have the most + common label) + + Args: + points_in_cluster (df): dataframe containing points in the same + cluster + label_col (str): column in the dataframe containing labels + """ + assert label_col in points_in_cluster.columns + + most_freq_label = points_in_cluster[label_col].mode()[0] + purity = len(points_in_cluster[points_in_cluster[label_col] == + most_freq_label]) / len(points_in_cluster) + return purity + + +class OneHotWrapper(): + """ Helper class to streamline one-hot encoding. + + Args: + impute_missing (bool): whether or not to impute np.nan values. + sparse (bool): whether or not to return a sparse matrix. + handle_unknown (str): specifies the way unknown categories are + handled during transform. + """ + + def __init__( + self, + impute_missing=False, + sparse=False, + handle_unknown='ignore', + ): + self.impute_missing = impute_missing + if self.impute_missing: + self.encoder = make_pipeline( + SimpleImputer(missing_values=np.nan, + strategy='constant', + fill_value='missing'), + OneHotEncoder(sparse=False, handle_unknown=handle_unknown)) + else: + self.encoder = OneHotEncoder(sparse=sparse, + handle_unknown=handle_unknown) + + def fit_transform(self, train_df, output_col_prefix=None): + """ Establish one-hot encoded variables. + + Args: + train_df (DataFrame): DataFrame containing train trips. 
+ output_col_prefix (str): only if train_df is a single column + """ + # TODO: handle pd series + + train_df = train_df.copy() # to avoid SettingWithCopyWarning + + # if imputing, the dtype of each column must be string/object and not + # numerical, otherwise the SimpleImputer will fail + if self.impute_missing: + for col in train_df.columns: + train_df[col] = train_df[col].astype(object) + onehot_encoding = self.encoder.fit_transform(train_df) + self.onehot_encoding_cols_all = [] + for col in train_df.columns: + if train_df.shape[1] > 1 or output_col_prefix is None: + output_col_prefix = col + self.onehot_encoding_cols_all += [ + f'{output_col_prefix}_{val}' + for val in np.sort(train_df[col].dropna().unique()) + ] + # we handle np.nan separately because it is of type float, and may + # cause issues with np.sort if the rest of the unique values are + # strings + if any((train_df[col].isna())): + self.onehot_encoding_cols_all += [f'{output_col_prefix}_nan'] + + onehot_encoding_df = pd.DataFrame( + onehot_encoding, + columns=self.onehot_encoding_cols_all).set_index(train_df.index) + + # ignore the encoded columns for missing entries + self.onehot_encoding_cols = copy.deepcopy(self.onehot_encoding_cols_all) + for col in self.onehot_encoding_cols_all: + if col.endswith('_nan'): + onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) + self.onehot_encoding_cols.remove(col) + + return onehot_encoding_df.astype(int) + + def transform(self, test_df): + """ One-hot encoded features in accordance with features seen in the + train set. + + Args: + test_df (DataFrame): DataFrame of trips. + """ + # TODO: rename test_df, this one doesn't necessarily need to be a df + onehot_encoding = self.encoder.transform(test_df) + onehot_encoding_df = pd.DataFrame( + onehot_encoding, + columns=self.onehot_encoding_cols_all).set_index(test_df.index) + + # ignore the encoded columns for missing entries + for col in self.onehot_encoding_cols_all: + if col.endswith('_nan'): + onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) + + return onehot_encoding_df.astype(int) \ No newline at end of file diff --git a/emission/tests/modellingTests/TestRunForestModel.py b/emission/tests/modellingTests/TestRunForestModel.py new file mode 100644 index 000000000..b668c22b3 --- /dev/null +++ b/emission/tests/modellingTests/TestRunForestModel.py @@ -0,0 +1,200 @@ +import unittest +import logging + +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.trip_model.run_model as eamur +import emission.storage.timeseries.abstract_timeseries as esta +import emission.tests.modellingTests.modellingTestAssets as etmm +import emission.storage.decorations.analysis_timeseries_queries as esda +import emission.core.get_database as edb +import emission.storage.pipeline_queries as epq +import emission.core.wrapper.pipelinestate as ecwp + + +class TestRunForestModel(unittest.TestCase): + """these tests were copied forward during a refactor of the tour model + [https://github.com/e-mission/e-mission-server/blob/10772f892385d44e11e51e796b0780d8f6609a2c/emission/analysis/modelling/tour_model_first_only/load_predict.py#L114] + + it's uncertain what condition they are in besides having been refactored to + use the more recent tour modeling code. 
+ """ + + def setUp(self): + """ + sets up the end-to-end run model test with Confirmedtrip data + """ + logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', + level=logging.DEBUG) + + # configuration for randomly-generated test data + self.user_id = user_id = 'TestRunForestModel-TestData' + self.origin = (-105.1705977, 39.7402654,) + self.destination = (-105.1755606, 39.7673075) + self.min_trips = 14 + self.total_trips = 100 + self.clustered_trips = 33 # must have at least self.min_trips similar trips by default + self.has_label_percent = 0.9 # let's make a few that don't have a label, but invariant + # $clustered_trips * $has_label_percent > self.min_trips + # must be correct or else this test could fail under some random test cases. + + # for a negative test, below + self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl' + + # test data can be saved between test invocations, check if data exists before generating + ts = esta.TimeSeries.get_time_series(user_id) + test_data = list(ts.find_entries(["analysis/confirmed_trip"])) + if len(test_data) == 0: + # generate test data for the database + logging.debug(f"inserting mock Confirmedtrips into database") + + # generate labels with a known sample weight that we can rely on in the test + label_data = { + "mode_confirm": ['ebike', 'bike'], + "purpose_confirm": ['happy-hour', 'dog-park'], + "replaced_mode": ['walk'], + "mode_weights": [0.9, 0.1], + "purpose_weights": [0.1, 0.9] + } + + train = etmm.generate_mock_trips( + user_id=user_id, + trips=self.total_trips, + origin=self.origin, + destination=self.destination, + trip_part='od', + label_data=label_data, + within_threshold=self.clustered_trips, + threshold=0.004, # ~400m + has_label_p=self.has_label_percent + ) + + ts.bulk_insert(train) + + # confirm data write did not fail + test_data = esda.get_entries(key="analysis/confirmed_trip", user_id=user_id, time_query=None) + if len(test_data) != self.total_trips: + logging.debug(f'test invariant failed after generating test data') + self.fail() + else: + logging.debug(f'found {self.total_trips} trips in database') + + def tearDown(self): + """ + clean up database + """ + edb.get_analysis_timeseries_db().delete_many({'user_id': self.user_id}) + edb.get_model_db().delete_many({'user_id': self.user_id}) + edb.get_pipeline_state_db().delete_many({'user_id': self.user_id}) + + def testBuildForestModelFromConfig(self): + """ + forest model takes config arguments via the constructor for testing + purposes but will load from a file in /conf/analysis/ which is tested here + """ + + eamumt.ModelType.RANDOM_FOREST_CLASSIFIER.build() + # success if it didn't throw + + def testTrainForestModelWithZeroTrips(self): + """ + forest model takes config arguments via the constructor for testing + purposes but will load from a file in /conf/analysis/ which is tested here + """ + + # pass along debug model configuration + forest_model_config= { + "loc_feature" : "coordinates", + "radius": 500, + "size_thresh":1, + "purity_thresh":1.0, + "gamma":0.05, + "C":1, + "n_estimators":100, + "criterion":"gini", + "max_depth":'null', + "min_samples_split":2, + "min_samples_leaf":1, + "max_features":"sqrt", + "bootstrap":True, + "random_state":42, + "use_start_clusters":False, + "use_trip_clusters":True + } + + logging.debug(f'~~~~ do nothing ~~~~') + eamur.update_trip_model( + user_id=self.unused_user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + min_trips=self.min_trips, + 
model_config=forest_model_config + ) + + # user had no entries so their pipeline state should not have been set + # if it was set, the time query here would + stage = ecwp.PipelineStages.TRIP_MODEL + pipeline_state = epq.get_current_state(self.unused_user_id, stage) + self.assertIsNone( + pipeline_state['curr_run_ts'], + "pipeline should not have a current timestamp for the test user") + +# TODO :complete this test once prediction part is done + +''' + def test1RoundTripGreedySimilarityBinning(self): + """ + train a model, save it, load it, and use it for prediction, using + the high-level training/testing API provided via + run_model.py:update_trip_model() # train + run_model.py:predict_labels_with_n() # test + + for clustering, use the default greedy similarity binning model + """ + + # pass along debug model configuration + forest_model_config= { + "loc_feature" : "coordinates", + "radius": 500, + "size_thresh":1, + "purity_thresh":1.0, + "gamma":0.05, + "C":1, + "n_estimators":100, + "criterion":"gini", + "max_depth":'null', + "min_samples_split":2, + "min_samples_leaf":1, + "max_features":"sqrt", + "bootstrap":True, + "random_state":42, + "use_start_clusters":False, + "use_trip_clusters":True + } + + logging.debug(f'(TRAIN) creating a model based on trips in database') + eamur.update_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + min_trips=self.min_trips, + model_config=forest_model_config + ) + + logging.debug(f'(TEST) testing prediction of stored model') + test = etmm.build_mock_trip( + user_id=self.user_id, + origin=self.origin, + destination=self.destination + ) + prediction, n = eamur.predict_labels_with_n( + trip = test, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + model_config=forest_model_config + ) + + [logging.debug(p) for p in sorted(prediction, key=lambda r: r['p'], reverse=True)] + + self.assertNotEqual(len(prediction), 0, "should have a prediction") +''' From 3820d875f40d374792d19010772c961047b44cf3 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Tue, 3 Oct 2023 09:26:37 -0400 Subject: [PATCH 09/28] [NOT TESTED]Predict implemented Predict is now included. Just need to figure out model storage and model testing. 
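A minimal usage sketch, assuming the debug configuration and the mock-trip helper used in the tests above (user_id, origin and destination come from that test setup). Training and prediction go through the same high-level run_model API that the commented-out round-trip test exercises; note that the ForestClassifier constructor maps max_depth == 'null' to None.

import emission.analysis.modelling.trip_model.model_storage as eamums
import emission.analysis.modelling.trip_model.model_type as eamumt
import emission.analysis.modelling.trip_model.run_model as eamur
import emission.tests.modellingTests.modellingTestAssets as etmm

forest_model_config = {
    "loc_feature": "coordinates",
    "radius": 500,
    "size_thresh": 1,
    "purity_thresh": 1.0,
    "gamma": 0.05,
    "C": 1,
    "n_estimators": 100,
    "criterion": "gini",
    "max_depth": 'null',   # translated to None by the ForestClassifier constructor
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
    "bootstrap": True,
    "random_state": 42,
    "use_start_clusters": False,
    "use_trip_clusters": True
}

# train and store a forest model for this user
eamur.update_trip_model(
    user_id=user_id,
    model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER,
    model_storage=eamums.ModelStorage.DOCUMENT_DATABASE,
    min_trips=14,
    model_config=forest_model_config
)

# build a mock trip and ask the stored model for label predictions
test_trip = etmm.build_mock_trip(user_id=user_id, origin=origin, destination=destination)
prediction, n = eamur.predict_labels_with_n(
    trip=test_trip,
    model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER,
    model_storage=eamums.ModelStorage.DOCUMENT_DATABASE,
    model_config=forest_model_config
)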
--- .../modelling/trip_model/forest_classifier.py | 280 ++++++++++++++++-- .../trip_model/greedy_similarity_binning.py | 2 +- .../modelling/trip_model/run_model.py | 5 +- 3 files changed, 257 insertions(+), 30 deletions(-) diff --git a/emission/analysis/modelling/trip_model/forest_classifier.py b/emission/analysis/modelling/trip_model/forest_classifier.py index 0816717db..8041ecc44 100644 --- a/emission/analysis/modelling/trip_model/forest_classifier.py +++ b/emission/analysis/modelling/trip_model/forest_classifier.py @@ -2,37 +2,64 @@ from sklearn.preprocessing import OneHotEncoder import joblib from typing import Dict, List, Optional, Tuple +from sklearn.metrics.pairwise import haversine_distances import emission.core.wrapper.confirmedtrip as ecwc import logging import numpy as np - +import copy import emission.analysis.modelling.trip_model.trip_model as eamuu import emission.analysis.modelling.trip_model.dbscan_svm as eamtd import emission.analysis.modelling.trip_model.util as eamtu import emission.analysis.modelling.trip_model.config as eamtc +import emission.storage.timeseries.builtin_timeseries as estb +from sklearn.exceptions import NotFittedError from sklearn.ensemble import RandomForestClassifier + +EARTH_RADIUS = 6371000 + class ForestClassifier(eamuu.TripModel): def __init__(self,config=None): - # expected_keys = [ - # 'metric', - # 'similarity_threshold_meters', - # 'apply_cutoff', - # 'incremental_evaluation' - # ] - # for k in expected_keys: - # if config.get(k) is None: - # msg = f"greedy trip model config missing expected key {k}" - # raise KeyError(msg) if config is None: config = eamtc.get_config_value_or_raise('model_parameters.forest') logging.debug(f'ForestClassifier loaded model config from file') else: logging.debug(f'ForestClassifier using model config argument') + + random_forest_expected_keys = [ + 'loc_feature', + 'n_estimators', + 'criterion', + 'max_depth', + 'min_samples_split', + 'min_samples_leaf', + 'max_features', + 'bootstrap', + ] + cluster_expected_keys= [ + 'radius', + 'size_thresh', + 'purity_thresh', + 'gamma', + 'C', + 'use_start_clusters', + 'use_trip_clusters', + ] + + for k in random_forest_expected_keys: + if config.get(k) is None: + msg = f"forest trip model config missing expected key {k}" + raise KeyError(msg) + if config['loc_feature'] == 'cluster': + for k in cluster_expected_keys: + if config.get(k) is None: + msg = f"cluster trip model config missing expected key {k}" + raise KeyError(msg) + self.loc_feature = config['loc_feature'] self.radius = config['radius'] self.size_thresh = config['size_thresh'] @@ -50,6 +77,21 @@ def __init__(self,config=None): # self.drop_unclustered = drop_unclustered self.use_start_clusters = config['use_start_clusters'] self.use_trip_clusters = config['use_trip_clusters'] + self.base_features = [ + 'duration', + 'distance', + 'start_local_dt_year', + 'start_local_dt_month', + 'start_local_dt_day', + 'start_local_dt_hour', + 'start_local_dt_weekday', + 'end_local_dt_year', # most likely the same as the start year + 'end_local_dt_month', # most likely the same as the start month + 'end_local_dt_day', + 'end_local_dt_hour', + 'end_local_dt_weekday', + ] + self.targets = ['mode_true', 'purpose_true', 'replaced_true'] if self.loc_feature == 'cluster': # clustering algorithm to generate end clusters @@ -119,8 +161,16 @@ def __init__(self,config=None): random_state=self.random_state) - def fit(self,data: List[ecwc.Confirmedtrip],data_df=None): + def fit(self,trips: List[ecwc.Confirmedtrip]): # get location features + 
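+        # overall flow of fit(), per the code below: verify that every trip
+        # is labeled, convert the Entry list to a dataframe, build location
+        # features (cluster indices or raw start/end coordinates), one-hot
+        # encode purpose and mode, then fit one RandomForestClassifier per
+        # label category (purpose, mode, replaced-mode)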
logging.debug(f'fit called with {len(trips)} trips') + + unlabeled = list(filter(lambda t: len(t['data']['user_input']) == 0, trips)) + if len(unlabeled) > 0: + msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}' + raise Exception(msg) + data_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",trips) + if self.loc_feature == 'cluster': # fit clustering model(s) and one-hot encode their indices # TODO: consolidate start/end_cluster_model in a single instance @@ -238,22 +288,196 @@ def fit(self,data: List[ecwc.Confirmedtrip],data_df=None): self.mode_predictor.fit(self.X_mode, self.y_mode) if len(self.X_replaced) > 0: self.replaced_predictor.fit(self.X_replaced, self.y_replaced) - - def predict(self, data: List[float]) -> Tuple[List[Dict], int]: - pass + logging.info(f"Forest model fit to {len(trips)} rows of trip data") - def to_dict(self) -> Dict: - return joblib.dump(self,compress=3) - - def from_dict(self, model: Dict): - pass + def predict(self, trips: List[float]) -> Tuple[List[Dict], int]: + logging.debug(f"forest classifier predict called with {len(trips)} trips") - def is_incremental(self) -> bool: - pass + if len(trips) == 0: + msg = f'model.predict cannot be called with 0 trips' + raise Exception(msg) + + # CONVERT TRIPS TO dataFrame + test_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",trips) - def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: - pass + self.X_test_for_purpose = self._get_X_test_for_purpose(test_df) + + ######################## + ### make predictions ### + ######################## + # note that we want to use purpose data to aid our mode predictions, + # and use both purpose and mode data to aid our replaced-mode + # predictions + try: + purpose_proba_raw = self.purpose_predictor.predict_proba( + self.X_test_for_purpose) + purpose_proba = pd.DataFrame( + purpose_proba_raw, columns=self.purpose_predictor.classes_) + purpose_pred = purpose_proba.idxmax(axis=1) + + # update X_test with one-hot-encoded purpose predictions to aid + # mode predictor + onehot_purpose_df = self.purpose_enc.transform( + pd.DataFrame(purpose_pred).set_index( + self.X_test_for_purpose.index)) + self.X_test_for_mode = pd.concat( + [self.X_test_for_purpose, onehot_purpose_df], axis=1) + + mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() + + except NotFittedError as e: + # if we can't predict purpose, we can still try to predict mode and + # replaced-mode without one-hot encoding the purpose + + purpose_pred = np.full((len(self.X_test_for_purpose), ), np.nan) + purpose_proba_raw = np.full((len(self.X_test_for_purpose), 1), 0) + purpose_proba = pd.DataFrame(purpose_proba_raw, columns=[np.nan]) + + self.X_test_for_mode = self.X_test_for_purpose + mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() + + mode_pred = mode_proba.idxmax(axis=1) + replaced_pred = replaced_proba.idxmax(axis=1) + + if (purpose_pred.dtype == np.float64 and mode_pred.dtype == np.float64 + and replaced_pred.dtype == np.float64): + # this indicates that all the predictions are np.nan so none of the + # random forest classifiers were fitted + raise NotFittedError + + proba_dfs = [] + for label_type, proba in zip( + ['purpose', 'mode', 'replaced'], + [purpose_proba, mode_proba, replaced_proba]): + proba['top_pred'] = proba.idxmax(axis=1) + proba['top_proba'] = proba.max(axis=1, skipna=True) + proba['clusterable'] = self._clusterable( + self.X_test_for_purpose).astype(bool) + proba = pd.concat([proba], 
keys=[label_type], axis=1) + proba_dfs += [proba] + + self.proba_df = pd.concat(proba_dfs, axis=1) + return self.proba_df + + def _try_predict_proba_mode_replaced(self): + """ Try to predict mode and replaced-mode. Handles error in case the + ensemble algorithms were not fitted. + + Requires self.X_test_for_mode to have already been set. (These are + the DataFrames containing the test data to be passed into self. + mode_predictor.) + + Returns: mode_proba and replaced_proba, two DataFrames containing + class probabilities for mode and replaced-mode respectively + """ + + try: + # predict mode + mode_proba_raw = self.mode_predictor.predict_proba( + self.X_test_for_mode) + mode_proba = pd.DataFrame(mode_proba_raw, + columns=self.mode_predictor.classes_) + mode_pred = mode_proba.idxmax(axis=1) + + # update X_test with one-hot-encoded mode predictions to aid + # replaced-mode predictor + onehot_mode_df = self.mode_enc.transform( + pd.DataFrame(mode_pred).set_index(self.X_test_for_mode.index)) + self.X_test_for_replaced = pd.concat( + [self.X_test_for_mode, onehot_mode_df], axis=1) + replaced_proba = self._try_predict_proba_replaced() + + except NotFittedError as e: + mode_proba_raw = np.full((len(self.X_test_for_mode), 1), 0) + mode_proba = pd.DataFrame(mode_proba_raw, columns=[np.nan]) + + # if we don't have mode predictions, we *could* still try to + # predict replaced mode (but if the user didn't input mode labels + # then it's unlikely they would input replaced-mode) + self.X_test_for_replaced = self.X_test_for_mode + replaced_proba = self._try_predict_proba_replaced() + + return mode_proba, replaced_proba + + def _get_X_test_for_purpose(self, test_df): + """ Do the pre-processing to get data that we can then pass into the + ensemble classifiers. + """ + if self.loc_feature == 'cluster': + # get clusters + self.end_cluster_model.predict(test_df) + clusters_to_encode = self.end_cluster_model.test_df[[ + 'end_cluster_idx' + ]].copy() # copy is to avoid SettingWithCopyWarning + + if self.use_start_clusters or self.use_trip_clusters: + self.start_cluster_model.predict(test_df) + + if self.use_start_clusters: + clusters_to_encode = pd.concat([ + clusters_to_encode, + self.start_cluster_model.test_df[['start_cluster_idx']] + ], + axis=1) + if self.use_trip_clusters: + start_end_clusters = pd.concat([ + self.end_cluster_model.test_df[['end_cluster_idx']], + self.start_cluster_model.test_df[['start_cluster_idx']] + ], + axis=1) + trip_cluster_idx = self.trip_grouper.transform( + start_end_clusters) + clusters_to_encode.loc[:, + 'trip_cluster_idx'] = trip_cluster_idx + + # one-hot encode the cluster indices + loc_features_df = self.cluster_enc.transform(clusters_to_encode) + else: # self.loc_feature == 'coordinates' + test_df = self._clean_data(test_df) + loc_features_df = test_df[[ + 'start_lon', 'start_lat', 'end_lon', 'end_lat' + ]] + + # extract the desired data + X_test = pd.concat([ + test_df[self.base_features].reset_index(drop=True), + loc_features_df.reset_index(drop=True) + ], + axis=1) + return X_test + + + def _clusterable(self, test_df): + """ Check if the end points can be clustered (i.e. 
are within + meters of an end point from the training set) + """ + if self.loc_feature == 'cluster': + return self.end_cluster_model.test_df.end_cluster_idx >= 0 + + n_samples = test_df.shape[0] + clustered = np.ones(shape=n_samples, dtype=int) * False + + train_coordinates = self.train_df[['end_lat', 'end_lon']] + train_radians = np.radians(train_coordinates) + + for idx, row in test_df.reset_index(drop=True).iterrows(): + # calculate the distances between the ith test data and all points, + # then find the minimum distance for each point and check if it's + # within the distance threshold. + # unfortunately, pairwise_distances_argmin() does not support + # haversine distance, so we have to reimplement it ourselves + new_loc_radians = np.radians(row[["end_lat", "end_lon"]].to_list()) + new_loc_radians = np.reshape(new_loc_radians, (1, 2)) + dist_matrix_meters = haversine_distances( + new_loc_radians, train_radians) * EARTH_RADIUS + + shortest_dist = np.min(dist_matrix_meters) + if shortest_dist < self.radius: + clustered[idx] = True + + return clustered + def _clean_data(self, df): """ Clean a dataframe of trips. (Drop trips with missing start/end locations, expand the user input @@ -332,4 +556,10 @@ def expand_coords(exp_df, purpose=None): dfs.append(df) # display.display(end_loc_df.head()) - return pd.concat(dfs, axis=1) \ No newline at end of file + return pd.concat(dfs, axis=1) + + def to_dict(self) -> Dict: + return joblib.dump(self,compress=3) + + def from_dict(self, model: Dict): + pass diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 4e9ee6fad..a19f5e5c0 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -128,7 +128,7 @@ class label to apply: self.bins: Dict[str, Dict] = {} - def fit(self, trips: List[ecwc.Confirmedtrip],tripsdf=None): + def fit(self, trips: List[ecwc.Confirmedtrip]): """train the model by passing data, where each row in the data corresponds to a label at the matching index of the label input diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py index 45b524f9a..63f1f2ef0 100644 --- a/emission/analysis/modelling/trip_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -56,10 +56,7 @@ def update_trip_model( logging.debug(f'model type {model_type.name} is incremental? {model.is_incremental}') logging.debug(f'time query for training data collection: {time_query}') - ts = esta.TimeSeries.get_time_series(user_id) trips = _get_training_data(user_id, time_query) - - trips_df = ts.to_data_df("analysis/confirmed_trip",trips) # don't start training for a user that doesn't have at least $trips many trips # (assume if a stored model exists for the user, that they met this requirement previously) if len(trips) == 0: @@ -77,7 +74,7 @@ def update_trip_model( # train and store the model. pass both List of event and dataframe time data # that both standard( which mostly work on df) and self implemented models can use. 
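        # the forest classifier now converts the Entry list to a dataframe
        # itself (BuiltinTimeSeries.to_data_df inside fit), so run_model no
        # longer needs to build and pass a separate trips_df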
- model.fit(trips,trips_df) + model.fit(trips) model_data_next = model.to_dict() if len(model_data_next) == 0: From 5b2572e53e5611691c5117333ca153b5202ba895 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Mon, 9 Oct 2023 17:01:18 -0400 Subject: [PATCH 10/28] [NOT TESTED] Model storage and Model Testing included Model loading and storing is now improved since it just stores the required predictors and encoders. Regression test and null value test included in tests. --- .../modelling/trip_model/forest_classifier.py | 41 ++++- .../tests/modellingTests/TestForestModel.py | 172 ++++++++++++++++++ .../modellingTests/TestRunForestModel.py | 18 +- 3 files changed, 213 insertions(+), 18 deletions(-) create mode 100644 emission/tests/modellingTests/TestForestModel.py diff --git a/emission/analysis/modelling/trip_model/forest_classifier.py b/emission/analysis/modelling/trip_model/forest_classifier.py index 8041ecc44..a76d03628 100644 --- a/emission/analysis/modelling/trip_model/forest_classifier.py +++ b/emission/analysis/modelling/trip_model/forest_classifier.py @@ -558,8 +558,39 @@ def expand_coords(exp_df, purpose=None): # display.display(end_loc_df.head()) return pd.concat(dfs, axis=1) - def to_dict(self) -> Dict: - return joblib.dump(self,compress=3) - - def from_dict(self, model: Dict): - pass +def to_dict(self): + """ + Convert the model to a dictionary suitable for storage. + """ + data = { + 'purpose_predictor': joblib.dumps(self.purpose_predictor).hex(), + 'mode_predictor': joblib.dumps(self.mode_predictor).hex(), + 'replaced_predictor': joblib.dumps(self.replaced_predictor).hex(), + 'cluster_enc': joblib.dumps(self.cluster_enc).hex(), + 'purpose_enc': joblib.dumps(self.purpose_enc).hex(), + 'mode_enc': joblib.dumps(self.mode_enc).hex(), + } + + if self.loc_feature == 'cluster': + data.update({ + 'end_cluster_model' : joblib.dumps(self.end_cluster_model).hex(), + 'start_cluster_model': joblib.dumps(self.start_cluster_model).hex(), + 'trip_grouper': joblib.dumps(self.trip_grouper).hex()}) + + return data + +def from_dict(self, model_data: Dict): + """ + Load the model from a dictionary. 
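+    Each entry is expected to hold a hex-encoded, serialized copy of one
+    fitted component (predictor or encoder), mirroring the format produced by
+    to_dict() above.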
+ """ + self.purpose_predictor = joblib.loads(bytes.fromhex(model_data['purpose_predictor'])) + self.mode_predictor = joblib.loads(bytes.fromhex(model_data['mode_predictor'])) + self.replaced_predictor = joblib.loads(bytes.fromhex(model_data['replaced_predictor'])) + self.cluster_enc = joblib.loads(bytes.fromhex(model_data['cluster_enc'])) + self.purpose_enc = joblib.loads(bytes.fromhex(model_data['purpose_enc'])) + self.mode_enc = joblib.loads(bytes.fromhex(model_data['mode_enc'])) + if self.loc_feature == 'cluster': + self.end_cluster_model = joblib.loads(bytes.fromhex(model_data['end_cluster_model'])) + self.start_cluster_model = joblib.loads(bytes.fromhex(model_data['start_cluster_model'])) + self.trip_grouper = joblib.loads(bytes.fromhex(model_data['trip_grouper'])) + diff --git a/emission/tests/modellingTests/TestForestModel.py b/emission/tests/modellingTests/TestForestModel.py new file mode 100644 index 000000000..f477f1ab9 --- /dev/null +++ b/emission/tests/modellingTests/TestForestModel.py @@ -0,0 +1,172 @@ +import unittest +import logging + +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.trip_model.run_model as eamur +import emission.storage.timeseries.abstract_timeseries as esta +import emission.tests.modellingTests.modellingTestAssets as etmm +import emission.storage.decorations.analysis_timeseries_queries as esda +import emission.core.get_database as edb +import emission.storage.pipeline_queries as epq +import emission.core.wrapper.pipelinestate as ecwp +import numpy as np + +class TestRunForestModel(unittest.TestCase): + """these tests were copied forward during a refactor of the tour model + [https://github.com/e-mission/e-mission-server/blob/10772f892385d44e11e51e796b0780d8f6609a2c/emission/analysis/modelling/tour_model_first_only/load_predict.py#L114] + + it's uncertain what condition they are in besides having been refactored to + use the more recent tour modeling code. + """ + + def setUp(self): + """ + sets up the end-to-end run model test with Confirmedtrip data + """ + logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', + level=logging.DEBUG) + + # configuration for randomly-generated test data + self.user_id = user_id = 'TestRunForestModel-TestData' + self.origin = (-105.1705977, 39.7402654,) + self.destination = (-105.1755606, 39.7673075) + self.min_trips = 14 + self.total_trips = 100 + self.clustered_trips = 33 # must have at least self.min_trips similar trips by default + self.has_label_percent = 0.9 # let's make a few that don't have a label, but invariant + # $clustered_trips * $has_label_percent > self.min_trips + # must be correct or else this test could fail under some random test cases. 
+ + # for a negative test, below + self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl' + + # test data can be saved between test invocations, check if data exists before generating + ts = esta.TimeSeries.get_time_series(user_id) + test_data = list(ts.find_entries(["analysis/confirmed_trip"])) + if len(test_data) == 0: + # generate test data for the database + logging.debug(f"inserting mock Confirmedtrips into database") + + # generate labels with a known sample weight that we can rely on in the test + label_data = { + "mode_confirm": ['ebike', 'bike'], + "purpose_confirm": ['happy-hour', 'dog-park'], + "replaced_mode": ['walk'], + "mode_weights": [0.9, 0.1], + "purpose_weights": [0.1, 0.9] + } + + train = etmm.generate_mock_trips( + user_id=user_id, + trips=self.total_trips, + origin=self.origin, + destination=self.destination, + trip_part='od', + label_data=label_data, + within_threshold=self.clustered_trips, + threshold=0.004, # ~400m + has_label_p=self.has_label_percent + ) + + ts.bulk_insert(train) + + # confirm data write did not fail + test_data = esda.get_entries(key="analysis/confirmed_trip", user_id=user_id, time_query=None) + if len(test_data) != self.total_trips: + logging.debug(f'test invariant failed after generating test data') + self.fail() + else: + logging.debug(f'found {self.total_trips} trips in database') + + def tearDown(self): + """ + clean up database + """ + edb.get_analysis_timeseries_db().delete_many({'user_id': self.user_id}) + edb.get_model_db().delete_many({'user_id': self.user_id}) + edb.get_pipeline_state_db().delete_many({'user_id': self.user_id}) + + +# def test_model_consistency(self): +# """ +# Test to ensure that the model's predictions on the mock data remain consistent. +# """ +# # Get the mock data from the parent class's setup +# mock_data = self.mock_data + +# # Predict using the model +# current_predictions = eamur.predict_labels_with_n( +# trip=mock_data, +# model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, +# model_storage=eamums.ModelStorage.DOCUMENT_DATABASE +# ) # assuming this is how you get predictions +# ## TODO : +# # Check if there are any previously stored predictions +# stored_predictions = list(self.collection.find({})) + +# if len(stored_predictions) == 0: +# # If not, store the current predictions as the ground truth +# self.collection.insert_many([{"index": i, "prediction": p} for i, p in enumerate(current_predictions)]) +# logging.debug("Stored current model predictions as ground truth.") +# else: +# # If there are stored predictions, compare them with the current predictions +# for stored_pred in stored_predictions: +# index, stored_value = stored_pred["index"], stored_pred["prediction"] +# current_value = current_predictions[index] + +# self.assertEqual(stored_value, current_value, f"Prediction at index {index} has changed! Expected {stored_value}, but got {current_value}.") + +# logging.debug("Model predictions are consistent with previously stored predictions.") + + + def test_regression(self): + """ + Regression test to ensure consistent model results. 
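+        Intended flow, per the helper methods below: load any previously
+        stored predictions; if none exist, store the current predictions as
+        the baseline; otherwise assert that the current predictions match the
+        stored ones (assertPredictionsMatch uses np.array_equal).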
+ """ + # Load the previously stored predictions (if any) + previous_predictions = self.load_previous_predictions() + + # Run the current model to get predictions + current_predictions = self.run_current_model() + + # If there are no previous predictions, store the current predictions + if previous_predictions is None: + self.store_predictions(current_predictions) + else: + # Compare the current predictions with the previous predictions + self.assertPredictionsMatch(previous_predictions, current_predictions) + + def load_previous_predictions(self): + # Retrieve stored predictions from the database + # Using get_analysis_timeseries_db as an example, replace with the correct method if needed + db = edb.get_analysis_timeseries_db() + predictions = db.find_one({"user_id": self.user_id, "metadata.key": "predictions"}) + return predictions + + def run_current_model(self): + # Placeholder: Run the current model and get predictions + # Replace this with the actual model running code + predictions = None + return predictions + + def store_predictions(self, predictions): + # Store the predictions in the database + # Using get_analysis_timeseries_db as an example, replace with the correct method if needed + db = edb.get_analysis_timeseries_db() + entry = { + "user_id": self.user_id, + "metadata": { + "key": "predictions", + "write_ts": pd.Timestamp.now().timestamp() # Using pandas timestamp as an example + }, + "data": predictions + } + db.insert_one(entry) + + def assertPredictionsMatch(self, prev, curr): + # Placeholder: Check if the predictions match + # This will depend on the format and type of your predictions + # For example, if predictions are lists or arrays, you can use numpy + if not np.array_equal(prev, curr): + self.fail("Current model predictions do not match previously stored predictions!") diff --git a/emission/tests/modellingTests/TestRunForestModel.py b/emission/tests/modellingTests/TestRunForestModel.py index b668c22b3..382ef4074 100644 --- a/emission/tests/modellingTests/TestRunForestModel.py +++ b/emission/tests/modellingTests/TestRunForestModel.py @@ -139,20 +139,13 @@ def testTrainForestModelWithZeroTrips(self): pipeline_state['curr_run_ts'], "pipeline should not have a current timestamp for the test user") -# TODO :complete this test once prediction part is done -''' - def test1RoundTripGreedySimilarityBinning(self): - """ - train a model, save it, load it, and use it for prediction, using - the high-level training/testing API provided via - run_model.py:update_trip_model() # train - run_model.py:predict_labels_with_n() # test - - for clustering, use the default greedy similarity binning model - """ + def testPredictForestModelWithZeroTrips(self): + """ + forest model takes config arguments via the constructor for testing + purposes but will load from a file in /conf/analysis/ which is tested here + """ - # pass along debug model configuration forest_model_config= { "loc_feature" : "coordinates", "radius": 500, @@ -197,4 +190,3 @@ def test1RoundTripGreedySimilarityBinning(self): [logging.debug(p) for p in sorted(prediction, key=lambda r: r['p'], reverse=True)] self.assertNotEqual(len(prediction), 0, "should have a prediction") -''' From bf7f406beb4ab56c25fa8596cb3d03e56570e9ce Mon Sep 17 00:00:00 2001 From: $aTyam Date: Thu, 2 Nov 2023 11:09:52 -0400 Subject: [PATCH 11/28] [TESTED]Forest Model Integration 1. switching a model is as simple as changing model_type in config file 2. ForestModel is now working. Main model is in model.py file which is copied from label_assist 3. 
TestRunForestModel.py is working. 3. Regression test in TestForestmodel.py are still under construction. --- .../modelling/trip_model/forest_classifier.py | 629 ++------- .../modelling/trip_model/model_type.py | 17 +- .../analysis/modelling/trip_model/models.py | 1194 +++++++++++++++++ emission/core/wrapper/entry.py | 3 + .../tests/modellingTests/TestForestModel.py | 101 +- .../modellingTests/TestRunForestModel.py | 12 +- .../modellingTests/modellingTestAssets.py | 5 +- 7 files changed, 1369 insertions(+), 592 deletions(-) create mode 100644 emission/analysis/modelling/trip_model/models.py diff --git a/emission/analysis/modelling/trip_model/forest_classifier.py b/emission/analysis/modelling/trip_model/forest_classifier.py index a76d03628..5a23d867e 100644 --- a/emission/analysis/modelling/trip_model/forest_classifier.py +++ b/emission/analysis/modelling/trip_model/forest_classifier.py @@ -5,17 +5,13 @@ from sklearn.metrics.pairwise import haversine_distances import emission.core.wrapper.confirmedtrip as ecwc import logging -import numpy as np -import copy +from io import BytesIO import emission.analysis.modelling.trip_model.trip_model as eamuu -import emission.analysis.modelling.trip_model.dbscan_svm as eamtd -import emission.analysis.modelling.trip_model.util as eamtu import emission.analysis.modelling.trip_model.config as eamtc import emission.storage.timeseries.builtin_timeseries as estb -from sklearn.exceptions import NotFittedError - -from sklearn.ensemble import RandomForestClassifier +import emission.storage.decorations.trip_queries as esdtq +from emission.analysis.modelling.trip_model.models import ForestClassifierModel EARTH_RADIUS = 6371000 @@ -33,7 +29,6 @@ def __init__(self,config=None): 'loc_feature', 'n_estimators', 'criterion', - 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features', @@ -59,538 +54,124 @@ def __init__(self,config=None): if config.get(k) is None: msg = f"cluster trip model config missing expected key {k}" raise KeyError(msg) - - self.loc_feature = config['loc_feature'] - self.radius = config['radius'] - self.size_thresh = config['size_thresh'] - self.purity_thresh = config['purity_thresh'] - self.gamma = config['gamma'] - self.C = config['C'] - self.n_estimators = config['n_estimators'] - self.criterion =config['criterion'] - self.max_depth = config['max_depth'] if config['max_depth'] != 'null' else None - self.min_samples_split = config['min_samples_split'] - self.min_samples_leaf = config['min_samples_leaf'] - self.max_features = config['max_features'] - self.bootstrap = config['bootstrap'] - self.random_state = config['random_state'] - # self.drop_unclustered = drop_unclustered - self.use_start_clusters = config['use_start_clusters'] - self.use_trip_clusters = config['use_trip_clusters'] - self.base_features = [ - 'duration', - 'distance', - 'start_local_dt_year', - 'start_local_dt_month', - 'start_local_dt_day', - 'start_local_dt_hour', - 'start_local_dt_weekday', - 'end_local_dt_year', # most likely the same as the start year - 'end_local_dt_month', # most likely the same as the start month - 'end_local_dt_day', - 'end_local_dt_hour', - 'end_local_dt_weekday', - ] - self.targets = ['mode_true', 'purpose_true', 'replaced_true'] - - if self.loc_feature == 'cluster': - # clustering algorithm to generate end clusters - self.end_cluster_model = eamtd.DBSCANSVMCluster( - loc_type='end', - radius=self.radius, - size_thresh=self.size_thresh, - purity_thresh=self.purity_thresh, - gamma=self.gamma, - C=self.C) - - if self.use_start_clusters or 
self.use_trip_clusters: - # clustering algorithm to generate start clusters - self.start_cluster_model = eamtd.DBSCANSVMCluster( - loc_type='start', - radius=self.radius, - size_thresh=self.size_thresh, - purity_thresh=self.purity_thresh, - gamma=self.gamma, - C=self.C) - - if self.use_trip_clusters: - # helper class to generate trip-level clusters - self.trip_grouper = eamtd.TripGrouper( - start_cluster_col='start_cluster_idx', - end_cluster_col='end_cluster_idx') - - # wrapper class to generate one-hot encodings for cluster indices - self.cluster_enc = eamtu.OneHotWrapper(sparse=False, - handle_unknown='ignore') - - # wrapper class to generate one-hot encodings for purposes and modes - self.purpose_enc = eamtu.OneHotWrapper(impute_missing=True, - sparse=False, - handle_unknown='error') - self.mode_enc = eamtu.OneHotWrapper(impute_missing=True, - sparse=False, - handle_unknown='error') - - # ensemble classifiers for each label category - self.purpose_predictor = RandomForestClassifier( - n_estimators=self.n_estimators, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - max_features=self.max_features, - bootstrap=self.bootstrap, - random_state=self.random_state) - self.mode_predictor = RandomForestClassifier( - n_estimators=self.n_estimators, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - max_features=self.max_features, - bootstrap=self.bootstrap, - random_state=self.random_state) - self.replaced_predictor = RandomForestClassifier( - n_estimators=self.n_estimators, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - max_features=self.max_features, - bootstrap=self.bootstrap, - random_state=self.random_state) + self.model=ForestClassifierModel(config=config) def fit(self,trips: List[ecwc.Confirmedtrip]): - # get location features + ''' + trips : List of Entry type data + ''' + # check and raise exception if no data to fit logging.debug(f'fit called with {len(trips)} trips') unlabeled = list(filter(lambda t: len(t['data']['user_input']) == 0, trips)) if len(unlabeled) > 0: msg = f'model.fit cannot be called with unlabeled trips, found {len(unlabeled)}' - raise Exception(msg) - data_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",trips) - - if self.loc_feature == 'cluster': - # fit clustering model(s) and one-hot encode their indices - # TODO: consolidate start/end_cluster_model in a single instance - # that has a location_type parameter in the fit() method - self.end_cluster_model.fit(data_df) - - clusters_to_encode = self.end_cluster_model.train_df[[ - 'end_cluster_idx' - ]].copy() # copy is to avoid SettingWithCopyWarning - - if self.use_start_clusters or self.use_trip_clusters: - self.start_cluster_model.fit(data_df) - - if self.use_start_clusters: - clusters_to_encode = pd.concat([ - clusters_to_encode, - self.start_cluster_model.train_df[['start_cluster_idx']] - ], - axis=1) - if self.use_trip_clusters: - start_end_clusters = pd.concat([ - self.end_cluster_model.train_df[['end_cluster_idx']], - self.start_cluster_model.train_df[['start_cluster_idx']] - ], - axis=1) - trip_cluster_idx = self.trip_grouper.fit_transform( - start_end_clusters) - clusters_to_encode.loc[:, - 'trip_cluster_idx'] = trip_cluster_idx - - loc_features_df = self.cluster_enc.fit_transform( - clusters_to_encode.astype(int)) 
- - # clean the df again because we need it in the next step - # TODO: remove redundancy - self.train_df = self._clean_data(data_df) - - # TODO: move below code into a reusable function - if self.train_df.purpose_true.isna().any(): - num_nan = self.train_df.purpose_true.value_counts( - dropna=False).loc[np.nan] - logging.info( - f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' - ) - self.train_df = self.train_df.dropna( - subset=['purpose_true']).reset_index(drop=True) - if len(self.train_df) == 0: - # i.e. no valid trips after removing all nans - raise Exception('no valid trips; nothing to fit') - - else: # self.loc_feature == 'coordinates' - self.train_df = self._clean_data(data_df) - - # TODO: move below code into a reusable function - if self.train_df.purpose_true.isna().any(): - num_nan = self.train_df.purpose_true.value_counts( - dropna=False).loc[np.nan] - logging.info( - f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' - ) - self.train_df = self.train_df.dropna( - subset=['purpose_true']).reset_index(drop=True) - if len(self.train_df) == 0: - # i.e. no valid trips after removing all nans - raise Exception('no valid trips; nothing to fit') - - loc_features_df = self.train_df[[ - 'start_lon', 'start_lat', 'end_lon', 'end_lat' - ]] - - # prepare data for the ensemble classifiers - - # note that we want to use purpose data to aid our mode predictions, - # and use both purpose and mode data to aid our replaced-mode - # predictions - # thus, we want to one-hot encode the purpose and mode as data - # features, but also preserve an unencoded copy for the target columns - - # dataframe holding all features and targets - self.Xy_train = pd.concat( - [self.train_df[self.base_features + self.targets], loc_features_df], - axis=1) - - # encode purposes and modes - onehot_purpose_df = self.purpose_enc.fit_transform( - self.Xy_train[['purpose_true']], output_col_prefix='purpose') - onehot_mode_df = self.mode_enc.fit_transform( - self.Xy_train[['mode_true']], output_col_prefix='mode') - self.Xy_train = pd.concat( - [self.Xy_train, onehot_purpose_df, onehot_mode_df], axis=1) - - # for predicting purpose, drop encoded purpose and mode features, as - # well as all target labels - self.X_purpose = self.Xy_train.dropna(subset=['purpose_true']).drop( - labels=self.targets + self.purpose_enc.onehot_encoding_cols + - self.mode_enc.onehot_encoding_cols, - axis=1) - - # for predicting mode, we want to keep purpose data - self.X_mode = self.Xy_train.dropna(subset=['mode_true']).drop( - labels=self.targets + self.mode_enc.onehot_encoding_cols, axis=1) - - # for predicting replaced-mode, we want to keep purpose and mode data - self.X_replaced = self.Xy_train.dropna(subset=['replaced_true']).drop( - labels=self.targets, axis=1) - - self.y_purpose = self.Xy_train['purpose_true'].dropna() - self.y_mode = self.Xy_train['mode_true'].dropna() - self.y_replaced = self.Xy_train['replaced_true'].dropna() - - # fit classifiers - if len(self.X_purpose) > 0: - self.purpose_predictor.fit(self.X_purpose, self.y_purpose) - if len(self.X_mode) > 0: - self.mode_predictor.fit(self.X_mode, self.y_mode) - if len(self.X_replaced) > 0: - self.replaced_predictor.fit(self.X_replaced, self.y_replaced) - logging.info(f"Forest model fit to {len(trips)} rows of trip data") - - def predict(self, trips: List[float]) -> Tuple[List[Dict], int]: - logging.debug(f"forest classifier predict called with {len(trips)} trips") - - if len(trips) == 0: - msg = f'model.predict cannot be 
called with 0 trips' raise Exception(msg) - # CONVERT TRIPS TO dataFrame - test_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",trips) - - self.X_test_for_purpose = self._get_X_test_for_purpose(test_df) - - ######################## - ### make predictions ### - ######################## - # note that we want to use purpose data to aid our mode predictions, - # and use both purpose and mode data to aid our replaced-mode - # predictions - try: - purpose_proba_raw = self.purpose_predictor.predict_proba( - self.X_test_for_purpose) - purpose_proba = pd.DataFrame( - purpose_proba_raw, columns=self.purpose_predictor.classes_) - purpose_pred = purpose_proba.idxmax(axis=1) - - # update X_test with one-hot-encoded purpose predictions to aid - # mode predictor - onehot_purpose_df = self.purpose_enc.transform( - pd.DataFrame(purpose_pred).set_index( - self.X_test_for_purpose.index)) - self.X_test_for_mode = pd.concat( - [self.X_test_for_purpose, onehot_purpose_df], axis=1) - - mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() - - except NotFittedError as e: - # if we can't predict purpose, we can still try to predict mode and - # replaced-mode without one-hot encoding the purpose - - purpose_pred = np.full((len(self.X_test_for_purpose), ), np.nan) - purpose_proba_raw = np.full((len(self.X_test_for_purpose), 1), 0) - purpose_proba = pd.DataFrame(purpose_proba_raw, columns=[np.nan]) - - self.X_test_for_mode = self.X_test_for_purpose - mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() - - mode_pred = mode_proba.idxmax(axis=1) - replaced_pred = replaced_proba.idxmax(axis=1) - - if (purpose_pred.dtype == np.float64 and mode_pred.dtype == np.float64 - and replaced_pred.dtype == np.float64): - # this indicates that all the predictions are np.nan so none of the - # random forest classifiers were fitted - raise NotFittedError - - proba_dfs = [] - for label_type, proba in zip( - ['purpose', 'mode', 'replaced'], - [purpose_proba, mode_proba, replaced_proba]): - proba['top_pred'] = proba.idxmax(axis=1) - proba['top_proba'] = proba.max(axis=1, skipna=True) - proba['clusterable'] = self._clusterable( - self.X_test_for_purpose).astype(bool) - proba = pd.concat([proba], keys=[label_type], axis=1) - proba_dfs += [proba] - - self.proba_df = pd.concat(proba_dfs, axis=1) - return self.proba_df - - def _try_predict_proba_mode_replaced(self): - """ Try to predict mode and replaced-mode. Handles error in case the - ensemble algorithms were not fitted. + #Convert List of Entry to dataframe + data_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",trips) + labeled_trip_df = esdtq.filter_labeled_trips(data_df) + expanded_labeled_trip_df= esdtq.expand_userinputs(labeled_trip_df) + #fit models on dataframe + self.model.fit(expanded_labeled_trip_df) + + + def predict(self, trip: List[float]) -> Tuple[List[Dict], int]: + ''' + trip : A single trip whose mode, pupose and replaced mode are required + returns. 
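+        Returns (per the body below): a list containing a single dict of the
+        form {'labels': {'mode_confirm': ..., 'replaced_mode': ...,
+        'purpose_confirm': ...}, 'p': <average of the three label
+        probabilities>}, together with the length of that list.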
+ ''' + + #check if theres no trip to predict + logging.debug(f"forest classifier predict called with {len(trip)} trips") + if len(trip) == 0: + msg = f'model.predict cannot be called with an empty trips' + raise Exception(msg) + # CONVERT LIST OF TRIPS TO dataFrame + test_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",[trip]) + labeled_trip_df = esdtq.filter_labeled_trips(test_df) + expanded_labeled_trip_df= esdtq.expand_userinputs(labeled_trip_df) + predcitions_df= self.model.predict(expanded_labeled_trip_df) + + # the predictions_df currently holds the highest probable options + # individually in all three categories. the predictions_df are in the form + # + # purpose_pred | purpose_proba | mode_pred | mode_proba | replaced_pred | replaced proba + # dog-park | 1.0 | e-bike | 0.99 | walk | 1.1 + # + # + # However, to keep the trip model general, the forest model is expected to return + # + #PREDICTIONS [ {'labels': {'mode_confirm': 'e-bike', 'replaced_mode': 'walk', 'purpose_confirm': 'dog-park'}, + # 'p': ( Currently average of the 3 probabilities)}] + labels= { + 'mode_confirm': predcitions_df['mode_pred'].iloc[0], + 'replaced_mode' : predcitions_df['replaced_pred'].iloc[0], + 'purpose_confirm' : predcitions_df['purpose_pred'].iloc[0] + } - Requires self.X_test_for_mode to have already been set. (These are - the DataFrames containing the test data to be passed into self. - mode_predictor.) - - Returns: mode_proba and replaced_proba, two DataFrames containing - class probabilities for mode and replaced-mode respectively + avg_proba = predcitions_df[['purpose_proba','mode_proba','replaced_proba']].mean(axis=1).iloc[0] + predictions =[{ + 'labels' : labels, + 'p' : avg_proba + }] + return predictions, len(predictions) + + def to_dict(self): """ - - try: - # predict mode - mode_proba_raw = self.mode_predictor.predict_proba( - self.X_test_for_mode) - mode_proba = pd.DataFrame(mode_proba_raw, - columns=self.mode_predictor.classes_) - mode_pred = mode_proba.idxmax(axis=1) - - # update X_test with one-hot-encoded mode predictions to aid - # replaced-mode predictor - onehot_mode_df = self.mode_enc.transform( - pd.DataFrame(mode_pred).set_index(self.X_test_for_mode.index)) - self.X_test_for_replaced = pd.concat( - [self.X_test_for_mode, onehot_mode_df], axis=1) - replaced_proba = self._try_predict_proba_replaced() - - except NotFittedError as e: - mode_proba_raw = np.full((len(self.X_test_for_mode), 1), 0) - mode_proba = pd.DataFrame(mode_proba_raw, columns=[np.nan]) - - # if we don't have mode predictions, we *could* still try to - # predict replaced mode (but if the user didn't input mode labels - # then it's unlikely they would input replaced-mode) - self.X_test_for_replaced = self.X_test_for_mode - replaced_proba = self._try_predict_proba_replaced() - - return mode_proba, replaced_proba - - def _get_X_test_for_purpose(self, test_df): - """ Do the pre-processing to get data that we can then pass into the - ensemble classifiers. + Convert the model to a dictionary suitable for storage. 
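+        Each fitted component named in `attr` below (the three predictors,
+        the purpose/mode encoders and the training dataframe, plus the
+        cluster encoder and cluster models when loc_feature == 'cluster') is
+        serialized to raw bytes with joblib.dump through an in-memory BytesIO
+        buffer.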
""" - if self.loc_feature == 'cluster': - # get clusters - self.end_cluster_model.predict(test_df) - clusters_to_encode = self.end_cluster_model.test_df[[ - 'end_cluster_idx' - ]].copy() # copy is to avoid SettingWithCopyWarning - - if self.use_start_clusters or self.use_trip_clusters: - self.start_cluster_model.predict(test_df) - - if self.use_start_clusters: - clusters_to_encode = pd.concat([ - clusters_to_encode, - self.start_cluster_model.test_df[['start_cluster_idx']] - ], - axis=1) - if self.use_trip_clusters: - start_end_clusters = pd.concat([ - self.end_cluster_model.test_df[['end_cluster_idx']], - self.start_cluster_model.test_df[['start_cluster_idx']] - ], - axis=1) - trip_cluster_idx = self.trip_grouper.transform( - start_end_clusters) - clusters_to_encode.loc[:, - 'trip_cluster_idx'] = trip_cluster_idx - - # one-hot encode the cluster indices - loc_features_df = self.cluster_enc.transform(clusters_to_encode) - else: # self.loc_feature == 'coordinates' - test_df = self._clean_data(test_df) - loc_features_df = test_df[[ - 'start_lon', 'start_lat', 'end_lon', 'end_lat' - ]] - - # extract the desired data - X_test = pd.concat([ - test_df[self.base_features].reset_index(drop=True), - loc_features_df.reset_index(drop=True) - ], - axis=1) - - return X_test - - - def _clusterable(self, test_df): - """ Check if the end points can be clustered (i.e. are within - meters of an end point from the training set) + data={} + attr=[ 'purpose_predictor','mode_predictor','replaced_predictor','purpose_enc','mode_enc','train_df'] + if self.model.loc_feature == 'cluster': + ## confirm this includes all the extra encoders/models + attr.extend([ 'cluster_enc','end_cluster_model','start_cluster_model','trip_grouper']) + for attribute_name in attr: + buffer=BytesIO() + joblib.dump(getattr(self.model,attribute_name),buffer) + buffer.seek(0) + data[attribute_name]=buffer.getvalue() + + return data + + def from_dict(self,model: Dict): """ - if self.loc_feature == 'cluster': - return self.end_cluster_model.test_df.end_cluster_idx >= 0 - - n_samples = test_df.shape[0] - clustered = np.ones(shape=n_samples, dtype=int) * False - - train_coordinates = self.train_df[['end_lat', 'end_lon']] - train_radians = np.radians(train_coordinates) - - for idx, row in test_df.reset_index(drop=True).iterrows(): - # calculate the distances between the ith test data and all points, - # then find the minimum distance for each point and check if it's - # within the distance threshold. - # unfortunately, pairwise_distances_argmin() does not support - # haversine distance, so we have to reimplement it ourselves - new_loc_radians = np.radians(row[["end_lat", "end_lon"]].to_list()) - new_loc_radians = np.reshape(new_loc_radians, (1, 2)) - dist_matrix_meters = haversine_distances( - new_loc_radians, train_radians) * EARTH_RADIUS - - shortest_dist = np.min(dist_matrix_meters) - if shortest_dist < self.radius: - clustered[idx] = True - - return clustered - - def _clean_data(self, df): - """ Clean a dataframe of trips. - (Drop trips with missing start/end locations, expand the user input - columns, ensure all essential columns are present) - - Args: - df: a dataframe of trips. 
must contain the columns 'start_loc', - 'end_loc', and should also contain the user input columns - ('mode_confirm', 'purpose_confirm', 'replaced_mode') if - available - """ - assert 'start_loc' in df.columns and 'end_loc' in df.columns - - # clean up the dataframe by dropping entries with NaN locations and - # reset index - num_nan = 0 - if df.start_loc.isna().any(): - num_nan += df.start_loc.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['start_loc']) - if df.end_loc.isna().any(): - num_nan += df.end_loc.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['end_loc']) - - # expand the 'start_loc' and 'end_loc' column into 'start_lat', - # 'start_lon', 'end_lat', and 'end_lon' columns - df = self.expand_coords(df) - - # drop trips with missing coordinates - if df.start_lat.isna().any(): - num_nan += df.start_lat.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['start_lat']) - if df.start_lon.isna().any(): - num_nan += df.start_lon.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['start_lon']) - if df.end_lat.isna().any(): - num_nan += df.end_lat.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['end_lat']) - if df.end_lon.isna().any(): - num_nan = df.end_lon.value_counts(dropna=False).loc[np.nan] - df += df.dropna(subset=['end_lon']) - if num_nan > 0: - logging.info( - f'dropped {num_nan} trips that are missing location coordinates' - ) - - df = df.rename( - columns={ - 'mode_confirm': 'mode_true', - 'purpose_confirm': 'purpose_true', - 'replaced_mode': 'replaced_true' - }) - - for category in ['mode_true', 'purpose_true', 'replaced_true']: - if category not in df.columns: - # for example, if a user labels all their trip modes but none of their trip purposes - df.loc[:, category] = np.nan - - return df.reset_index(drop=True) - - def expand_coords(exp_df, purpose=None): + Load the model from a dictionary. """ - copied and modifed from get_loc_df_for_purpose() in the 'Radius - selection' notebook + attr=[ 'purpose_predictor','mode_predictor','replaced_predictor','purpose_enc','mode_enc','train_df'] + if self.model.loc_feature == 'cluster': + ## TODO : confirm this includes all the extra encoders/models + attr.extend([ 'cluster_enc','end_cluster_model','start_cluster_model','trip_grouper']) + for attribute_name in attr: + try: + if attribute_name in model: + buffer = BytesIO(model[attribute_name]) + setattr(self.model,attribute_name, joblib.load(buffer)) + except Exception as e: + print(f"Error loading {attribute_name}: {str(e)}") + # If we do not wish to raise the exception after logging the error, comment the line below + raise e + + def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: """ - purpose_trips = exp_df - if purpose is not None: - purpose_trips = exp_df[exp_df.purpose_confirm == purpose] - - dfs = [purpose_trips] - for loc_type in ['start', 'end']: - df = pd.DataFrame( - purpose_trips[loc_type + - "_loc"].apply(lambda p: p["coordinates"]).to_list(), - columns=[loc_type + "_lon", loc_type + "_lat"]) - df = df.set_index(purpose_trips.index) - dfs.append(df) - - # display.display(end_loc_df.head()) - return pd.concat(dfs, axis=1) - -def to_dict(self): - """ - Convert the model to a dictionary suitable for storage. 
- """ - data = { - 'purpose_predictor': joblib.dumps(self.purpose_predictor).hex(), - 'mode_predictor': joblib.dumps(self.mode_predictor).hex(), - 'replaced_predictor': joblib.dumps(self.replaced_predictor).hex(), - 'cluster_enc': joblib.dumps(self.cluster_enc).hex(), - 'purpose_enc': joblib.dumps(self.purpose_enc).hex(), - 'mode_enc': joblib.dumps(self.mode_enc).hex(), - } - - if self.loc_feature == 'cluster': - data.update({ - 'end_cluster_model' : joblib.dumps(self.end_cluster_model).hex(), - 'start_cluster_model': joblib.dumps(self.start_cluster_model).hex(), - 'trip_grouper': joblib.dumps(self.trip_grouper).hex()}) + extract the relevant features for learning from a trip for this model instance - return data + :param trip: the trip to extract features from + :type trip: Confirmedtrip + :return: a vector containing features to predict from + :rtype: List[float] + """ + pass -def from_dict(self, model_data: Dict): - """ - Load the model from a dictionary. - """ - self.purpose_predictor = joblib.loads(bytes.fromhex(model_data['purpose_predictor'])) - self.mode_predictor = joblib.loads(bytes.fromhex(model_data['mode_predictor'])) - self.replaced_predictor = joblib.loads(bytes.fromhex(model_data['replaced_predictor'])) - self.cluster_enc = joblib.loads(bytes.fromhex(model_data['cluster_enc'])) - self.purpose_enc = joblib.loads(bytes.fromhex(model_data['purpose_enc'])) - self.mode_enc = joblib.loads(bytes.fromhex(model_data['mode_enc'])) - if self.loc_feature == 'cluster': - self.end_cluster_model = joblib.loads(bytes.fromhex(model_data['end_cluster_model'])) - self.start_cluster_model = joblib.loads(bytes.fromhex(model_data['start_cluster_model'])) - self.trip_grouper = joblib.loads(bytes.fromhex(model_data['trip_grouper'])) + def is_incremental(self) -> bool: + """ + whether this model requires the complete user history to build (False), + or, if only the incremental data since last execution is required (True). + :return: if the model is incremental. the current timestamp will be recorded + in the analysis pipeline. the next call to this model will only include + trip data for trips later than the recorded timestamp. 
+ :rtype: bool + """ + pass \ No newline at end of file diff --git a/emission/analysis/modelling/trip_model/model_type.py b/emission/analysis/modelling/trip_model/model_type.py index 2d7e6f743..56268a51a 100644 --- a/emission/analysis/modelling/trip_model/model_type.py +++ b/emission/analysis/modelling/trip_model/model_type.py @@ -26,17 +26,16 @@ def build(self, config=None) -> eamuu.TripModel: :raises KeyError: if the requested model name does not exist """ # Dict[ModelType, TripModel] - MODELS = { - #ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning(config), - ModelType.RANDOM_FOREST_CLASSIFIER: eamuf.ForestClassifier(config) - } + MODELS = { + ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning, + ModelType.RANDOM_FOREST_CLASSIFIER: eamuf.ForestClassifier + } model = MODELS.get(self) if model is None: - model_names = list(lambda e: e.name, MODELS.keys()) - models = ",".join(model_names) - raise KeyError(f"ModelType {self.name} not found in factory, please add to build method") - - return model + available_models = ', '.join([ e.name for e in ModelType]) + raise KeyError(f"ModelType {self.name} not found in factory, Available models are {available_models}."\ + "Otherwise please add new model to build method") + return model(config) @classmethod def names(cls): diff --git a/emission/analysis/modelling/trip_model/models.py b/emission/analysis/modelling/trip_model/models.py new file mode 100644 index 000000000..a8da464c4 --- /dev/null +++ b/emission/analysis/modelling/trip_model/models.py @@ -0,0 +1,1194 @@ +######################################################################## +## Copied from /e-mission-eval-private-data/TRB_label_assist/models.py## +######################################################################## + + + +import pandas as pd +import numpy as np +from abc import ABCMeta, abstractmethod # to define abstract class "blueprints" +import logging +import copy + +# sklearn imports +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.impute import SimpleImputer +from sklearn.metrics.pairwise import haversine_distances +from sklearn.cluster import DBSCAN +from sklearn import svm +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier +from sklearn.tree import DecisionTreeClassifier +from sklearn.exceptions import NotFittedError + +# NOTE: tour_model_extended.similarity is on the +# eval-private-data-compatibility branch in e-mission-server + +# logging.basicConfig(level=logging.DEBUG) + +EARTH_RADIUS = 6371000 + +############################# +## define abstract classes ## +############################# + + +class SetupMixin(metaclass=ABCMeta): + """ class containing code to be reused when setting up estimators. """ + + def _clean_data(self, df): + """ Clean a dataframe of trips. + (Drop trips with missing start/end locations, expand the user input + columns, ensure all essential columns are present) + + Args: + df: a dataframe of trips. 
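# Sketch of the factory pattern ModelType.build() above uses: the dict maps
# enum members to classes (not instances), and the class is instantiated with
# the config only after a successful lookup. Names below are illustrative, not
# the real enums.
from enum import Enum

class ToyForestModel:
    def __init__(self, config):
        self.config = config

class ToyModelType(Enum):
    RANDOM_FOREST_CLASSIFIER = 'forest'

    def build(self, config=None):
        MODELS = {ToyModelType.RANDOM_FOREST_CLASSIFIER: ToyForestModel}
        model = MODELS.get(self)
        if model is None:
            available = ', '.join(e.name for e in ToyModelType)
            raise KeyError(f"{self.name} not found; available models: {available}")
        return model(config)

toy_model = ToyModelType.RANDOM_FOREST_CLASSIFIER.build({'radius': 100})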
must contain the columns 'start_loc', + 'end_loc', and should also contain the user input columns + ('mode_confirm', 'purpose_confirm', 'replaced_mode') if + available + """ + assert 'start_loc' in df.columns and 'end_loc' in df.columns + + # clean up the dataframe by dropping entries with NaN locations and + # reset index + num_nan = 0 + if df.start_loc.isna().any(): + num_nan += df.start_loc.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['start_loc']) + if df.end_loc.isna().any(): + num_nan += df.end_loc.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['end_loc']) + + # expand the 'start_loc' and 'end_loc' column into 'start_lat', + # 'start_lon', 'end_lat', and 'end_lon' columns + df = self.expand_coords(df) + + # drop trips with missing coordinates + if df.start_lat.isna().any(): + num_nan += df.start_lat.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['start_lat']) + if df.start_lon.isna().any(): + num_nan += df.start_lon.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['start_lon']) + if df.end_lat.isna().any(): + num_nan += df.end_lat.value_counts(dropna=False).loc[np.nan] + df = df.dropna(subset=['end_lat']) + if df.end_lon.isna().any(): + num_nan = df.end_lon.value_counts(dropna=False).loc[np.nan] + df += df.dropna(subset=['end_lon']) + if num_nan > 0: + logging.info( + f'dropped {num_nan} trips that are missing location coordinates' + ) + + df = df.rename( + columns={ + 'mode_confirm': 'mode_true', + 'purpose_confirm': 'purpose_true', + 'replaced_mode': 'replaced_true' + }) + + for category in ['mode_true', 'purpose_true', 'replaced_true']: + if category not in df.columns: + # for example, if a user labels all their trip modes but none of their trip purposes + df.loc[:, category] = np.nan + + return df.reset_index(drop=True) + + def expand_coords(self,exp_df, purpose=None): + """ + copied and modifed from get_loc_df_for_purpose() in the 'Radius + selection' notebook + """ + purpose_trips = exp_df + if purpose is not None: + purpose_trips = exp_df[exp_df.purpose_confirm == purpose] + + dfs = [purpose_trips] + for loc_type in ['start', 'end']: + df = pd.DataFrame( + purpose_trips[loc_type + + "_loc"].apply(lambda p: p["coordinates"]).to_list(), + columns=[loc_type + "_lon", loc_type + "_lat"]) + df = df.set_index(purpose_trips.index) + dfs.append(df) + + # display.display(end_loc_df.head()) + return pd.concat(dfs, axis=1) + + +class Cluster(SetupMixin, metaclass=ABCMeta): + """ blueprint for clustering models. """ + + @abstractmethod + def fit(self, train_df,ct_entry=None): + """ Fit the clustering algorithm. + + Args: + train_df (DataFrame): dataframe of labeled trips + ct_entry (List) : A list of Entry type of labeled and unlabeled trips + + Returns: + self + """ + raise NotImplementedError + + @abstractmethod + def predict(self, test_df): + """ Predict cluster indices for trips, if possible. Trips that could + not be clustered will have the index -1. + + Args: + test_df (DataFrame): dataframe of test trips + + Returns: + pd DataFrame containing one column, 'start_cluster_idx' or + 'end_cluster_idx' + """ + raise NotImplementedError + + def fit_predict(self, train_df): + """ Fit the clustering algorithm and predict cluster indices for trips, + if possible. Trips that could not be clustered will have the index -1. 
+ + Args: + train_df (DataFrame): dataframe of labeled trips + + Returns: + pd DataFrame containing one column, 'start_cluster_idx' or + 'end_cluster_idx' + """ + self.fit(train_df) + return self.predict(train_df) + + +class TripClassifier(SetupMixin, metaclass=ABCMeta): + + @abstractmethod + def fit(self, train_df,ct_entry=None): + """ Fit a classification model. + + Args: + train_df (DataFrame): dataframe of labeled trips + ct_entry (List) : A list of Entry type of labeled and unlabeled trips + + Returns: + self + """ + raise NotImplementedError + + def predict(self, test_df): + """ Predict trip labels. + + Args: + test_df (DataFrame): dataframe of trips + + Returns: + DataFrame containing the following columns: + 'purpose_pred', 'mode_pred', 'replaced_pred', + 'purpose_proba', 'mode_proba', 'replaced_proba' + the *_pred columns contain the most-likely label prediction + (string for a label or float for np.nan). + the *_proba columns contain the probability of the most-likely + prediction. + """ + proba_df = self.predict_proba(test_df) + prediction_df = proba_df.loc[:, [('purpose', 'top_pred'), + ('purpose', 'top_proba'), + ('mode', 'top_pred'), + ('mode', 'top_proba'), + ('replaced', 'top_pred'), + ('replaced', 'top_proba')]] + + prediction_df.columns = prediction_df.columns.to_flat_index() + prediction_df = prediction_df.rename( + columns={ + ('purpose', 'top_pred'): 'purpose_pred', + ('purpose', 'top_proba'): 'purpose_proba', + ('mode', 'top_pred'): 'mode_pred', + ('mode', 'top_proba'): 'mode_proba', + ('replaced', 'top_pred'): 'replaced_pred', + ('replaced', 'top_proba'): 'replaced_proba', + }) + + return prediction_df + + def fit_predict(self, train_df): + """ Fit a classification model and predict trip labels. + + Args: + train_df (DataFrame): dataframe of labeled trips + + Returns: + DataFrame containing the following columns: + 'purpose_pred', 'mode_pred', 'replaced_pred', + 'purpose_proba', 'mode_proba', 'replaced_proba' + the *_pred columns contain the most-likely label prediction + (string for a label or float for np.nan). + the *_proba columns contain the probability of the most-likely + prediction. + """ + self.fit(train_df) + return self.predict(train_df) + + @abstractmethod + def predict_proba(self, test_df): + """ Predict class probabilities for each trip. + + NOTE: check the specific model to see if the class probabilities + have confidence-discounting or not. + + Args: + test_df (DataFrame): dataframe of trips + + Returns: + DataFrame with multiindexing. Each row represents a trip. There + are 3 columns at level 1, one for each label category + ('purpose', 'mode', 'replaced'). Within each category, there is + a column for each label, with the row's entry being the + probability that the trip has the label. There are three + additional columns within each category, one indicating the + most-likely label, one indicating the probability of the + most-likely label, and one indicating whether or not the trip + can be clustered. + TODO: add a fourth optional column for the number of trips in + the cluster (if clusterable) + + Level 1 columns are: purpose, mode, replaced + Lebel 2 columns are: + , , ... top_pred, top_proba, clusterable + , , ... top_pred, top_proba, clusterable + , , ... top_pred, top_proba, clusterable + """ + raise NotImplementedError + + +class DBSCANSVMCluster(Cluster): + """ DBSCAN-based clustering algorithm that optionally implements SVM + sub-clustering. 
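# Toy version of the column flattening in TripClassifier.predict() above,
# showing only the 'purpose' block of the multi-indexed proba_df.
import pandas as pd

purpose_block = pd.DataFrame({'home': [0.8], 'work': [0.2],
                              'top_pred': ['home'], 'top_proba': [0.8]})
proba_df = pd.concat([purpose_block], keys=['purpose'], axis=1)

pred = proba_df.loc[:, [('purpose', 'top_pred'), ('purpose', 'top_proba')]]
pred.columns = pred.columns.to_flat_index()
pred = pred.rename(columns={('purpose', 'top_pred'): 'purpose_pred',
                            ('purpose', 'top_proba'): 'purpose_proba'})
# pred now has flat 'purpose_pred' / 'purpose_proba' columns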
+ + Args: + loc_type (str): 'start' or 'end', the type of point to cluster + radius (int): max distance between two points in each other's + neighborhood, i.e. DBSCAN's eps value. does not strictly + dictate final cluster size + size_thresh (int): the min number of trips a cluster must have + to be considered for SVM sub-division + purity_thresh (float): the min purity a cluster must have + to be sub-divided using SVM + gamma (float): coefficient for the rbf kernel in SVM + C (float): regularization hyperparameter for SVM + + Attributes: + loc_type (str) + radius (int) + size_thresh (int) + purity_thresh (float) + gamma (float) + C (float) + train_df (DataFrame) + test_df (DataFrame) + base_model (sklearn Estimator) + """ + + def __init__(self, + loc_type='end', + radius=100, + svm=True, + size_thresh=1, + purity_thresh=1.0, + gamma=0.05, + C=1): + logging.info("PERF: Initializing DBSCANSVMCluster") + self.loc_type = loc_type + self.radius = radius + self.svm = svm + self.size_thresh = size_thresh + self.purity_thresh = purity_thresh + self.gamma = gamma + self.C = C + + def set_params(self, params): + if 'loc_type' in params.keys(): self.loc_type = params['loc_type'] + if 'radius' in params.keys(): self.radius = params['radius'] + if 'svm' in params.keys(): self.svm = params['svm'] + if 'size_thresh' in params.keys(): + self.size_thresh = params['size_thresh'] + if 'purity_thresh' in params.keys(): + self.purity_thresh = params['purity_thresh'] + if 'gamma' in params.keys(): self.gamma = params['gamma'] + + return self + + def fit(self, train_df,ct_entry=None): + """ Creates clusters of trip points. + self.train_df will be updated with columns containing base and + final clusters. + + TODO: perhaps move the loc_type argument to fit() so we can use a + single class instance to cluster both start and end points. This + will also help us reduce duplicate data. + + Args: + train_df (dataframe): dataframe of labeled trips + ct_entry (List) : A list of Entry type of labeled and unlabeled trips + """ + ################## + ### clean data ### + ################## + logging.info("PERF: Fitting DBSCANSVMCluster") + self.train_df = self._clean_data(train_df) + + # we can use all trips as long as they have purpose labels. it's ok if + # they're missing mode/replaced-mode labels, because they aren't as + # strongly correlated with location compared to purpose + # TODO: actually, we may want to rethink this. for example, it will + # probably be helpful to include trips that are missing purpose labels + # but still have mode labels. + if self.train_df.purpose_true.isna().any(): + num_nan = self.train_df.purpose_true.value_counts( + dropna=False).loc[np.nan] + logging.info( + f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' + ) + self.train_df = self.train_df.dropna( + subset=['purpose_true']).reset_index(drop=True) + if len(self.train_df) == 0: + # i.e. 
no valid trips after removing all nans + raise Exception('no valid trips; nothing to fit') + + ######################### + ### get base clusters ### + ######################### + dist_matrix_meters = get_distance_matrix(self.train_df, self.loc_type) + self.base_model = DBSCAN(self.radius, + metric="precomputed", + min_samples=1).fit(dist_matrix_meters) + base_clusters = self.base_model.labels_ + + self.train_df.loc[:, + f'{self.loc_type}_base_cluster_idx'] = base_clusters + + ######################## + ### get sub-clusters ### + ######################## + # copy base cluster column into final cluster column + self.train_df.loc[:, f'{self.loc_type}_cluster_idx'] = self.train_df[ + f'{self.loc_type}_base_cluster_idx'] + + if self.svm: + c = 0 # count of how many clusters we have iterated over + + # iterate over all clusters and subdivide them with SVM. the while + # loop is so we can do multiple iterations of subdividing if needed + while c < self.train_df[f'{self.loc_type}_cluster_idx'].max(): + points_in_cluster = self.train_df[ + self.train_df[f'{self.loc_type}_cluster_idx'] == c] + + # only do SVM if we have the minimum num of trips in the cluster + if len(points_in_cluster) < self.size_thresh: + c += 1 + continue + + # only do SVM if purity is below threshold + purity = single_cluster_purity(points_in_cluster, + label_col='purpose_true') + if purity < self.purity_thresh: + X = points_in_cluster[[ + f"{self.loc_type}_lon", f"{self.loc_type}_lat" + ]] + y = points_in_cluster.purpose_true.to_list() + + svm_model = make_pipeline( + StandardScaler(), + svm.SVC( + kernel='rbf', + gamma=self.gamma, + C=self.C, + )).fit(X, y) + labels = svm_model.predict(X) + unique_labels = np.unique(labels) + + # if the SVM predicts that all points in the cluster have + # the same label, just ignore it and don't reindex. + # this also helps us to handle the possibility that a + # cluster may be impure but inherently inseparable, e.g. an + # end cluster at a user's home, containing 50% trips from + # work to home and 50% round trips that start and end at + # home. we don't want to reindex otherwise the low purity + # will trigger SVM again, and we will attempt & fail to + # split the cluster ad infinitum + if len(unique_labels) > 1: + # map purpose labels to new cluster indices + # we offset indices by the max existing index so that we + # don't run into any duplicate indices + max_existing_idx = self.train_df[ + f'{self.loc_type}_cluster_idx'].max() + label_to_cluster = { + unique_labels[i]: i + max_existing_idx + 1 + for i in range(len(unique_labels)) + } + # update trips with their new cluster indices + indices = np.array( + [label_to_cluster[l] for l in labels]) + self.train_df.loc[ + self.train_df[f'{self.loc_type}_cluster_idx'] == c, + f'{self.loc_type}_cluster_idx'] = indices + + c += 1 + # TODO: make things categorical at the end? or maybe at the start of the decision tree pipeline + + return self + + def fit_predict(self, train_df): + """ Override to avoid unnecessarily computation of distance matrices. 
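# Standalone sketch (made-up coordinates) of the base-clustering step in fit()
# above: pairwise haversine distances in meters are precomputed and passed to
# DBSCAN with min_samples=1, so every trip end lands in some base cluster.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import haversine_distances

EARTH_RADIUS = 6371000
end_latlon = np.array([[39.7400, -104.9900],
                       [39.7401, -104.9901],   # ~15 m from the first point
                       [39.8000, -105.1000]])  # several km away
dist_matrix_meters = haversine_distances(np.radians(end_latlon)) * EARTH_RADIUS
base_clusters = DBSCAN(eps=100, metric='precomputed',
                       min_samples=1).fit(dist_matrix_meters).labels_
# -> the first two points share a cluster; the third forms its own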
+ """ + self.fit(train_df) + return self.train_df[[f'{self.loc_type}_cluster_idx']] + + def predict(self, test_df): + logging.info("PERF: Predicting DBSCANSVMCluster") + # TODO: store clusters as polygons so the prediction is faster + # TODO: we probably don't want to store test_df in self to be more memory-efficient + self.test_df = self._clean_data(test_df) + pred_clusters = self._NN_predict(self.test_df) + + self.test_df.loc[:, f'{self.loc_type}_cluster_idx'] = pred_clusters + + return self.test_df[[f'{self.loc_type}_cluster_idx']] + + def _NN_predict(self, test_df): + """ Generate base-cluster predictions for the test data using a + nearest-neighbor approach. + + sklearn doesn't implement predict() for DBSCAN, which is why we + need a custom method. + """ + logging.info("PERF: NN_predicting DBSCANSVMCluster") + n_samples = test_df.shape[0] + labels = np.ones(shape=n_samples, dtype=int) * -1 + + # get coordinates of core points (we can't use model.components_ + # because our input feature was a distance matrix and doesn't contain + # info about the raw coordinates) + # NOTE: technically, every single point in a cluster is a core point + # because it has at least minPts (2) points, including itself, in its + # radius + train_coordinates = self.train_df[[ + f'{self.loc_type}_lat', f'{self.loc_type}_lon' + ]] + train_radians = np.radians(train_coordinates) + + for idx, row in test_df.reset_index(drop=True).iterrows(): + # calculate the distances between the ith test data and all points, + # then find the index of the closest point. if the ith test data is + # within epsilon of the point, then assign its cluster to the ith + # test data (otherwise, leave it as -1, indicating noise). + # unfortunately, pairwise_distances_argmin() does not support + # haversine distance, so we have to reimplement it ourselves + new_loc_radians = np.radians( + row[[self.loc_type + "_lat", self.loc_type + "_lon"]].to_list()) + new_loc_radians = np.reshape(new_loc_radians, (1, 2)) + dist_matrix_meters = haversine_distances( + new_loc_radians, train_radians) * EARTH_RADIUS + + shortest_dist_idx = np.argmin(dist_matrix_meters) + if dist_matrix_meters[0, shortest_dist_idx] < self.radius: + labels[idx] = self.train_df.reset_index( + drop=True).loc[shortest_dist_idx, + f'{self.loc_type}_cluster_idx'] + + return labels + + + +class EnsembleClassifier(TripClassifier, metaclass=ABCMeta): + """ Template class for trip classifiers using ensemble algorithms. 
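# Sketch of the nearest-neighbour assignment in _NN_predict() above: a new end
# point inherits the cluster of its closest training point, but only if that
# point lies within `radius` meters (coordinates and cluster indices below are
# illustrative).
import numpy as np
from sklearn.metrics.pairwise import haversine_distances

EARTH_RADIUS = 6371000
radius = 100
train_radians = np.radians([[39.7400, -104.9900], [39.8000, -105.1000]])
train_cluster_idx = np.array([0, 1])
new_loc_radians = np.radians([[39.7401, -104.9901]])
dist_meters = haversine_distances(new_loc_radians, train_radians) * EARTH_RADIUS
nearest = np.argmin(dist_meters)
label = train_cluster_idx[nearest] if dist_meters[0, nearest] < radius else -1  # -> 0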
+ + Required args: + loc_feature (str): 'coordinates' or 'cluster' + """ + base_features = [ + 'duration', + 'distance', + 'start_local_dt_year', + 'start_local_dt_month', + 'start_local_dt_day', + 'start_local_dt_hour', + # 'start_local_dt_minute', + 'start_local_dt_weekday', + 'end_local_dt_year', # most likely the same as the start year + 'end_local_dt_month', # most likely the same as the start month + 'end_local_dt_day', + 'end_local_dt_hour', + # 'end_local_dt_minute', + 'end_local_dt_weekday', + ] + targets = ['mode_true', 'purpose_true', 'replaced_true'] + + # required instance attributes + loc_feature = NotImplemented + purpose_enc = NotImplemented + mode_enc = NotImplemented + purpose_predictor = NotImplemented + mode_predictor = NotImplemented + replaced_predictor = NotImplemented + + # required methods + def fit(self, train_df,ct_entry=None): + # get location features + if self.loc_feature == 'cluster': + # fit clustering model(s) and one-hot encode their indices + # TODO: consolidate start/end_cluster_model in a single instance + # that has a location_type parameter in the fit() method + self.end_cluster_model.fit(train_df) + + clusters_to_encode = self.end_cluster_model.train_df[[ + 'end_cluster_idx' + ]].copy() # copy is to avoid SettingWithCopyWarning + + if self.use_start_clusters or self.use_trip_clusters: + self.start_cluster_model.fit(train_df) + + if self.use_start_clusters: + clusters_to_encode = pd.concat([ + clusters_to_encode, + self.start_cluster_model.train_df[['start_cluster_idx']] + ], + axis=1) + if self.use_trip_clusters: + start_end_clusters = pd.concat([ + self.end_cluster_model.train_df[['end_cluster_idx']], + self.start_cluster_model.train_df[['start_cluster_idx']] + ], + axis=1) + trip_cluster_idx = self.trip_grouper.fit_transform( + start_end_clusters) + clusters_to_encode.loc[:, + 'trip_cluster_idx'] = trip_cluster_idx + + loc_features_df = self.cluster_enc.fit_transform( + clusters_to_encode.astype(int)) + + # clean the df again because we need it in the next step + # TODO: remove redundancy + self.train_df = self._clean_data(train_df) + + # TODO: move below code into a reusable function + if self.train_df.purpose_true.isna().any(): + num_nan = self.train_df.purpose_true.value_counts( + dropna=False).loc[np.nan] + logging.info( + f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' + ) + self.train_df = self.train_df.dropna( + subset=['purpose_true']).reset_index(drop=True) + if len(self.train_df) == 0: + # i.e. no valid trips after removing all nans + raise Exception('no valid trips; nothing to fit') + + else: # self.loc_feature == 'coordinates' + self.train_df = self._clean_data(train_df) + + # TODO: move below code into a reusable function + if self.train_df.purpose_true.isna().any(): + num_nan = self.train_df.purpose_true.value_counts( + dropna=False).loc[np.nan] + logging.info( + f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' + ) + self.train_df = self.train_df.dropna( + subset=['purpose_true']).reset_index(drop=True) + if len(self.train_df) == 0: + # i.e. 
no valid trips after removing all nans + raise Exception('no valid trips; nothing to fit') + + loc_features_df = self.train_df[[ + 'start_lon', 'start_lat', 'end_lon', 'end_lat' + ]] + + # prepare data for the ensemble classifiers + + # note that we want to use purpose data to aid our mode predictions, + # and use both purpose and mode data to aid our replaced-mode + # predictions + # thus, we want to one-hot encode the purpose and mode as data + # features, but also preserve an unencoded copy for the target columns + + # dataframe holding all features and targets + self.Xy_train = pd.concat( + [self.train_df[self.base_features + self.targets], loc_features_df], + axis=1) + + # encode purposes and modes + onehot_purpose_df = self.purpose_enc.fit_transform( + self.Xy_train[['purpose_true']], output_col_prefix='purpose') + onehot_mode_df = self.mode_enc.fit_transform( + self.Xy_train[['mode_true']], output_col_prefix='mode') + self.Xy_train = pd.concat( + [self.Xy_train, onehot_purpose_df, onehot_mode_df], axis=1) + + # for predicting purpose, drop encoded purpose and mode features, as + # well as all target labels + self.X_purpose = self.Xy_train.dropna(subset=['purpose_true']).drop( + labels=self.targets + self.purpose_enc.onehot_encoding_cols + + self.mode_enc.onehot_encoding_cols, + axis=1) + + # for predicting mode, we want to keep purpose data + self.X_mode = self.Xy_train.dropna(subset=['mode_true']).drop( + labels=self.targets + self.mode_enc.onehot_encoding_cols, axis=1) + + # for predicting replaced-mode, we want to keep purpose and mode data + self.X_replaced = self.Xy_train.dropna(subset=['replaced_true']).drop( + labels=self.targets, axis=1) + + self.y_purpose = self.Xy_train['purpose_true'].dropna() + self.y_mode = self.Xy_train['mode_true'].dropna() + self.y_replaced = self.Xy_train['replaced_true'].dropna() + + # fit classifiers + if len(self.X_purpose) > 0: + self.purpose_predictor.fit(self.X_purpose, self.y_purpose) + if len(self.X_mode) > 0: + self.mode_predictor.fit(self.X_mode, self.y_mode) + if len(self.X_replaced) > 0: + self.replaced_predictor.fit(self.X_replaced, self.y_replaced) + + return self + + def predict_proba(self, test_df): + """ NOTE: these class probabilities do NOT have a + confidence-discounting heuristic applied. 
+ """ + ################ + ### get data ### + ################ + self.X_test_for_purpose = self._get_X_test_for_purpose(test_df) + + ######################## + ### make predictions ### + ######################## + # note that we want to use purpose data to aid our mode predictions, + # and use both purpose and mode data to aid our replaced-mode + # predictions + + # TODO: some of the code across the try and except blocks can be + # consolidated by considering one-hot encoding fully np.nan arrays + try: + purpose_proba_raw = self.purpose_predictor.predict_proba( + self.X_test_for_purpose) + purpose_proba = pd.DataFrame( + purpose_proba_raw, columns=self.purpose_predictor.classes_) + purpose_pred = purpose_proba.idxmax(axis=1) + + # update X_test with one-hot-encoded purpose predictions to aid + # mode predictor + # TODO: converting purpose_pred to a DataFrame feels super + # unnecessary, make this more efficient + onehot_purpose_df = self.purpose_enc.transform( + pd.DataFrame(purpose_pred).set_index( + self.X_test_for_purpose.index)) + self.X_test_for_mode = pd.concat( + [self.X_test_for_purpose, onehot_purpose_df], axis=1) + + mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() + + except NotFittedError as e: + # if we can't predict purpose, we can still try to predict mode and + # replaced-mode without one-hot encoding the purpose + + purpose_pred = np.full((len(self.X_test_for_purpose), ), np.nan) + purpose_proba_raw = np.full((len(self.X_test_for_purpose), 1), 0) + purpose_proba = pd.DataFrame(purpose_proba_raw, columns=[np.nan]) + + self.X_test_for_mode = self.X_test_for_purpose + mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() + + mode_pred = mode_proba.idxmax(axis=1) + replaced_pred = replaced_proba.idxmax(axis=1) + + if (purpose_pred.dtype == np.float64 and mode_pred.dtype == np.float64 + and replaced_pred.dtype == np.float64): + # this indicates that all the predictions are np.nan so none of the + # random forest classifiers were fitted + raise NotFittedError + + # TODO: move this to a Mixin for cluster-based predictors and use the + # 'cluster' column of the proba_df outputs + # if self.drop_unclustered: + # # TODO: actually, we should only drop purpose predictions. we can + # # then impute the missing entries in the purpose feature and still + # # try to predict mode and replaced-mode without it + # self.predictions.loc[ + # self.end_cluster_model.test_df['end_cluster_idx'] == -1, + # ['purpose_pred', 'mode_pred', 'replaced_pred']] = np.nan + + proba_dfs = [] + for label_type, proba in zip( + ['purpose', 'mode', 'replaced'], + [purpose_proba, mode_proba, replaced_proba]): + proba['top_pred'] = proba.idxmax(axis=1) + proba['top_proba'] = proba.max(axis=1, skipna=True) + proba['clusterable'] = self._clusterable( + self.X_test_for_purpose).astype(bool) + proba = pd.concat([proba], keys=[label_type], axis=1) + proba_dfs += [proba] + + self.proba_df = pd.concat(proba_dfs, axis=1) + return self.proba_df + + def _get_X_test_for_purpose(self, test_df): + """ Do the pre-processing to get data that we can then pass into the + ensemble classifiers. 
+ """ + if self.loc_feature == 'cluster': + # get clusters + self.end_cluster_model.predict(test_df) + clusters_to_encode = self.end_cluster_model.test_df[[ + 'end_cluster_idx' + ]].copy() # copy is to avoid SettingWithCopyWarning + + if self.use_start_clusters or self.use_trip_clusters: + self.start_cluster_model.predict(test_df) + + if self.use_start_clusters: + clusters_to_encode = pd.concat([ + clusters_to_encode, + self.start_cluster_model.test_df[['start_cluster_idx']] + ], + axis=1) + if self.use_trip_clusters: + start_end_clusters = pd.concat([ + self.end_cluster_model.test_df[['end_cluster_idx']], + self.start_cluster_model.test_df[['start_cluster_idx']] + ], + axis=1) + trip_cluster_idx = self.trip_grouper.transform( + start_end_clusters) + clusters_to_encode.loc[:, + 'trip_cluster_idx'] = trip_cluster_idx + + # one-hot encode the cluster indices + loc_features_df = self.cluster_enc.transform(clusters_to_encode) + else: # self.loc_feature == 'coordinates' + test_df = self._clean_data(test_df) + loc_features_df = test_df[[ + 'start_lon', 'start_lat', 'end_lon', 'end_lat' + ]] + + # extract the desired data + X_test = pd.concat([ + test_df[self.base_features].reset_index(drop=True), + loc_features_df.reset_index(drop=True) + ], + axis=1) + + return X_test + + def _try_predict_proba_mode_replaced(self): + """ Try to predict mode and replaced-mode. Handles error in case the + ensemble algorithms were not fitted. + + Requires self.X_test_for_mode to have already been set. (These are + the DataFrames containing the test data to be passed into self. + mode_predictor.) + + Returns: mode_proba and replaced_proba, two DataFrames containing + class probabilities for mode and replaced-mode respectively + """ + + try: + # predict mode + mode_proba_raw = self.mode_predictor.predict_proba( + self.X_test_for_mode) + mode_proba = pd.DataFrame(mode_proba_raw, + columns=self.mode_predictor.classes_) + mode_pred = mode_proba.idxmax(axis=1) + + # update X_test with one-hot-encoded mode predictions to aid + # replaced-mode predictor + onehot_mode_df = self.mode_enc.transform( + pd.DataFrame(mode_pred).set_index(self.X_test_for_mode.index)) + self.X_test_for_replaced = pd.concat( + [self.X_test_for_mode, onehot_mode_df], axis=1) + replaced_proba = self._try_predict_proba_replaced() + + except NotFittedError as e: + mode_proba_raw = np.full((len(self.X_test_for_mode), 1), 0) + mode_proba = pd.DataFrame(mode_proba_raw, columns=[np.nan]) + + # if we don't have mode predictions, we *could* still try to + # predict replaced mode (but if the user didn't input mode labels + # then it's unlikely they would input replaced-mode) + self.X_test_for_replaced = self.X_test_for_mode + replaced_proba = self._try_predict_proba_replaced() + + return mode_proba, replaced_proba + + def _try_predict_proba_replaced(self): + """ Try to predict replaced mode. Handles error in case the + replaced_predictor was not fitted. + + Requires self.X_test_for_replaced to have already been set. (This + is the DataFrame containing the test data to be passed into self. + replaced_predictor.) 
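# Toy illustration of the chained prediction used in predict_proba() above:
# the most likely purpose is one-hot encoded and appended to the features
# before mode is predicted. Data and column names here are made up; the real
# pipeline uses OneHotWrapper and the trained purpose/mode predictors.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

X = pd.DataFrame({'duration': [300, 1200, 600], 'distance': [500, 8000, 2500]})
purpose_clf = RandomForestClassifier(random_state=0).fit(X, ['home', 'work', 'shopping'])

purpose_proba = pd.DataFrame(purpose_clf.predict_proba(X),
                             columns=purpose_clf.classes_)
purpose_pred = purpose_proba.idxmax(axis=1)

# one-hot encode the purpose prediction and append it as extra features
X_for_mode = pd.concat([X, pd.get_dummies(purpose_pred, prefix='purpose')], axis=1)
# X_for_mode is what would be handed to the mode predictor next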
+ + Returns: replaced_proba, DataFrame containing class probabilities + for replaced-mode + """ + try: + replaced_proba_raw = self.replaced_predictor.predict_proba( + self.X_test_for_replaced + ) # has shape (len_trips, number of replaced_mode classes) + replaced_proba = pd.DataFrame( + replaced_proba_raw, columns=self.replaced_predictor.classes_) + + except NotFittedError as e: + replaced_proba_raw = np.full((len(self.X_test_for_replaced), 1), 0) + replaced_proba = pd.DataFrame(replaced_proba_raw, columns=[np.nan]) + + return replaced_proba + + def _clusterable(self, test_df): + """ Check if the end points can be clustered (i.e. are within + meters of an end point from the training set) + """ + if self.loc_feature == 'cluster': + return self.end_cluster_model.test_df.end_cluster_idx >= 0 + + n_samples = test_df.shape[0] + clustered = np.ones(shape=n_samples, dtype=int) * False + + train_coordinates = self.train_df[['end_lat', 'end_lon']] + train_radians = np.radians(train_coordinates) + + for idx, row in test_df.reset_index(drop=True).iterrows(): + # calculate the distances between the ith test data and all points, + # then find the minimum distance for each point and check if it's + # within the distance threshold. + # unfortunately, pairwise_distances_argmin() does not support + # haversine distance, so we have to reimplement it ourselves + new_loc_radians = np.radians(row[["end_lat", "end_lon"]].to_list()) + new_loc_radians = np.reshape(new_loc_radians, (1, 2)) + dist_matrix_meters = haversine_distances( + new_loc_radians, train_radians) * EARTH_RADIUS + + shortest_dist = np.min(dist_matrix_meters) + if shortest_dist < self.radius: + clustered[idx] = True + + return clustered + + +class ForestClassifierModel(EnsembleClassifier): + """ Random forest-based trip classifier. + + Args: + loc_feature (str): 'coordinates' or 'cluster'; whether to use lat/ + lon coordinates or cluster indices for the location feature + radius (int): radius for DBSCAN clustering. only if + loc_feature=='cluster' + size_thresh (int): the min number of trips a cluster must have to + be considered for sub-division via SVM. only if + loc_feature=='cluster' + purity_thresh (float): the min purity a cluster must have to be + sub-divided via SVM. only if loc_feature=='cluster' + gamma (float): coefficient for the rbf kernel in SVM. only if + loc_feature=='cluster' + C (float): regularization hyperparameter for SVM. only if + loc_feature=='cluster' + n_estimators (int): number of estimators in the random forest + criterion (str): function to measure the quality of a split in the + random forest + max_depth (int): max depth of a tree in the random forest. + unlimited if None. + min_samples_split (int): min number of samples required to split an + internal node in a decision tree + min_samples_leaf (int): min number of samples required for a leaf + node in a decision tree + max_features (str): number of features to consider when looking for + the best split in a decision tree + bootstrap (bool): whether bootstrap samples are used when building + decision trees + random_state (int): random state for deterministic random forest + construction + use_start_clusters (bool): whether or not to use start clusters as + input features to the ensemble classifier. only if + loc_feature=='cluster' + use_trip_clusters (bool): whether or not to use trip-level clusters + as input features to the ensemble classifier. 
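# Illustrative config dict covering the parameters documented above (the same
# keys the ForestClassifierModel constructor reads). Values are placeholders,
# not recommended settings; the real values come from
# conf/analysis/trip_model.conf.json.
example_forest_config = {
    'loc_feature': 'coordinates',
    'radius': 100,
    'size_thresh': 1,
    'purity_thresh': 1.0,
    'gamma': 0.05,
    'C': 1,
    'n_estimators': 100,
    'criterion': 'gini',
    'max_depth': 'null',        # becomes None in the constructor
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'bootstrap': True,
    'random_state': 42,
    'use_start_clusters': False,
    'use_trip_clusters': True,
}
# model = ForestClassifierModel(example_forest_config)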
only if + loc_feature=='cluster' + """ + + def __init__(self,config): + + self.loc_feature = config['loc_feature'] + self.radius = config['radius'] + self.size_thresh = config['size_thresh'] + self.purity_thresh = config['purity_thresh'] + self.gamma = config['gamma'] + self.C = config['C'] + self.n_estimators = config['n_estimators'] + self.criterion =config['criterion'] + self.max_depth = config['max_depth'] if config['max_depth'] != 'null' else None + self.min_samples_split = config['min_samples_split'] + self.min_samples_leaf = config['min_samples_leaf'] + self.max_features = config['max_features'] + self.bootstrap = config['bootstrap'] + self.random_state = config['random_state'] + # self.drop_unclustered = drop_unclustered + self.use_start_clusters = config['use_start_clusters'] + self.use_trip_clusters = config['use_trip_clusters'] + self.base_features = [ + 'duration', + 'distance', + 'start_local_dt_year', + 'start_local_dt_month', + 'start_local_dt_day', + 'start_local_dt_hour', + 'start_local_dt_weekday', + 'end_local_dt_year', # most likely the same as the start year + 'end_local_dt_month', # most likely the same as the start month + 'end_local_dt_day', + 'end_local_dt_hour', + 'end_local_dt_weekday', + ] + self.targets = ['mode_true', 'purpose_true', 'replaced_true'] + + if self.loc_feature == 'cluster': + # clustering algorithm to generate end clusters + self.end_cluster_model = DBSCANSVMCluster( + loc_type='end', + radius=self.radius, + size_thresh=self.size_thresh, + purity_thresh=self.purity_thresh, + gamma=self.gamma, + C=self.C) + + if self.use_start_clusters or self.use_trip_clusters: + # clustering algorithm to generate start clusters + self.start_cluster_model = DBSCANSVMCluster( + loc_type='start', + radius=self.radius, + size_thresh=self.size_thresh, + purity_thresh=self.purity_thresh, + gamma=self.gamma, + C=self.C) + + if self.use_trip_clusters: + # helper class to generate trip-level clusters + self.trip_grouper = TripGrouper( + start_cluster_col='start_cluster_idx', + end_cluster_col='end_cluster_idx') + + # wrapper class to generate one-hot encodings for cluster indices + self.cluster_enc = OneHotWrapper(sparse=False, + handle_unknown='ignore') + + # wrapper class to generate one-hot encodings for purposes and modes + self.purpose_enc = OneHotWrapper(impute_missing=True, + sparse=False, + handle_unknown='error') + self.mode_enc = OneHotWrapper(impute_missing=True, + sparse=False, + handle_unknown='error') + + # ensemble classifiers for each label category + self.purpose_predictor = RandomForestClassifier( + n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_features=self.max_features, + bootstrap=self.bootstrap, + random_state=self.random_state) + self.mode_predictor = RandomForestClassifier( + n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_features=self.max_features, + bootstrap=self.bootstrap, + random_state=self.random_state) + self.replaced_predictor = RandomForestClassifier( + n_estimators=self.n_estimators, + criterion=self.criterion, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_features=self.max_features, + bootstrap=self.bootstrap, + random_state=self.random_state) + + +class TripGrouper(): + """ Helper class to get trip 
clusters from start and end clusters. + + Args: + start_cluster_col (str): name of the column containing start + cluster indices + end_cluster_col (str): name of the column containing end cluster + indices + """ + + def __init__(self, + start_cluster_col='start_cluster_idx', + end_cluster_col='end_cluster_idx'): + self.start_cluster_col = start_cluster_col + self.end_cluster_col = end_cluster_col + + def fit_transform(self, trip_df): + """ Fit and remember possible trip clusters. + + Args: + trip_df (DataFrame): DataFrame containing trips. must have + columns and + """ + trip_groups = trip_df.groupby( + [self.start_cluster_col, self.end_cluster_col]) + + # need dict so we can access the trip indices of all the trips in each + # group. the key is the group tuple and the value is the list of trip + # indices in the group. + self.trip_groups_dict = dict(trip_groups.groups) + + # we want to convert trip-group tuples to to trip-cluster indices, + # hence the pd Series + trip_groups_series = pd.Series(list(self.trip_groups_dict.keys())) + + trip_cluster_idx = np.empty(len(trip_df)) + + for group_idx in range(len(trip_groups_series)): + group_tuple = trip_groups_series[group_idx] + trip_idxs_in_group = self.trip_groups_dict[group_tuple] + trip_cluster_idx[trip_idxs_in_group] = group_idx + + return trip_cluster_idx + + def transform(self, new_trip_df): + """ Get trip clusters for a new set of trips. + + Args: + new_trip_df (DataFrame): DataFrame containing trips. must have + columns and + """ + prediction_trip_groups = new_trip_df.groupby( + [self.start_cluster_col, self.end_cluster_col]) + + # need dict so we can access the trip indices of all the trips in each + # group. the key is the group tuple and the value is the list of trip + # indices in the group. + prediction_trip_groups_dict = dict(prediction_trip_groups.groups) + trip_groups_series = pd.Series(list(self.trip_groups_dict.keys())) + trip_cluster_idx = np.empty(len(new_trip_df)) + + for group_tuple in dict(prediction_trip_groups.groups).keys(): + # check if the trip cluster exists in the training set + trip_idxs_in_group = prediction_trip_groups_dict[group_tuple] + if group_tuple in self.trip_groups_dict.keys(): + # look up the group index from the series we created when we + # fit the model + group_idx = trip_groups_series[trip_groups_series == + group_tuple].index[0] + else: + group_idx = -1 + + trip_cluster_idx[trip_idxs_in_group] = group_idx + + return trip_cluster_idx + + + +class OneHotWrapper(): + """ Helper class to streamline one-hot encoding. + + Args: + impute_missing (bool): whether or not to impute np.nan values. + sparse (bool): whether or not to return a sparse matrix. + handle_unknown (str): specifies the way unknown categories are + handled during transform. + """ + + def __init__( + self, + impute_missing=False, + sparse=False, + handle_unknown='ignore', + ): + self.impute_missing = impute_missing + if self.impute_missing: + self.encoder = make_pipeline( + SimpleImputer(missing_values=np.nan, + strategy='constant', + fill_value='missing'), + OneHotEncoder(sparse=False, handle_unknown=handle_unknown)) + else: + self.encoder = OneHotEncoder(sparse=sparse, + handle_unknown=handle_unknown) + + def fit_transform(self, train_df, output_col_prefix=None): + """ Establish one-hot encoded variables. + + Args: + train_df (DataFrame): DataFrame containing train trips. 
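# Toy illustration (made-up cluster indices) of the TripGrouper idea above:
# each unique (start_cluster, end_cluster) pair becomes one trip-level cluster.
import pandas as pd

trips = pd.DataFrame({'start_cluster_idx': [0, 0, 1, 0],
                      'end_cluster_idx':   [2, 2, 3, 3]})
pairs = list(trips.groupby(['start_cluster_idx', 'end_cluster_idx']).groups.keys())
trip_cluster_idx = trips.apply(
    lambda row: pairs.index((row.start_cluster_idx, row.end_cluster_idx)), axis=1)
# -> trips 0 and 1 share a trip cluster; trips 2 and 3 each get their own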
+ output_col_prefix (str): only if train_df is a single column + """ + # TODO: handle pd series + + train_df = train_df.copy() # to avoid SettingWithCopyWarning + + # if imputing, the dtype of each column must be string/object and not + # numerical, otherwise the SimpleImputer will fail + if self.impute_missing: + for col in train_df.columns: + train_df[col] = train_df[col].astype(object) + onehot_encoding = self.encoder.fit_transform(train_df) + self.onehot_encoding_cols_all = [] + for col in train_df.columns: + if train_df.shape[1] > 1 or output_col_prefix is None: + output_col_prefix = col + self.onehot_encoding_cols_all += [ + f'{output_col_prefix}_{val}' + for val in np.sort(train_df[col].dropna().unique()) + ] + # we handle np.nan separately because it is of type float, and may + # cause issues with np.sort if the rest of the unique values are + # strings + if any((train_df[col].isna())): + self.onehot_encoding_cols_all += [f'{output_col_prefix}_nan'] + + onehot_encoding_df = pd.DataFrame( + onehot_encoding, + columns=self.onehot_encoding_cols_all).set_index(train_df.index) + + # ignore the encoded columns for missing entries + self.onehot_encoding_cols = copy.deepcopy(self.onehot_encoding_cols_all) + for col in self.onehot_encoding_cols_all: + if col.endswith('_nan'): + onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) + self.onehot_encoding_cols.remove(col) + + return onehot_encoding_df.astype(int) + + def transform(self, test_df): + """ One-hot encoded features in accordance with features seen in the + train set. + + Args: + test_df (DataFrame): DataFrame of trips. + """ + # TODO: rename test_df, this one doesn't necessarily need to be a df + onehot_encoding = self.encoder.transform(test_df) + onehot_encoding_df = pd.DataFrame( + onehot_encoding, + columns=self.onehot_encoding_cols_all).set_index(test_df.index) + + # ignore the encoded columns for missing entries + for col in self.onehot_encoding_cols_all: + if col.endswith('_nan'): + onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) + + return onehot_encoding_df.astype(int) diff --git a/emission/core/wrapper/entry.py b/emission/core/wrapper/entry.py index b4d8520f7..a11eaac8c 100644 --- a/emission/core/wrapper/entry.py +++ b/emission/core/wrapper/entry.py @@ -182,6 +182,9 @@ def create_fake_entry(user_id, key, data, write_ts, create_id=False): result_entry.user_id = user_id result_entry.metadata = ecwm.Metadata.create_metadata_for_fake_result(key, write_ts) result_entry.data = data + #necessary values required by forest model + result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt'] + result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt'] result_entry._populateDependencies() return result_entry diff --git a/emission/tests/modellingTests/TestForestModel.py b/emission/tests/modellingTests/TestForestModel.py index f477f1ab9..8895cb366 100644 --- a/emission/tests/modellingTests/TestForestModel.py +++ b/emission/tests/modellingTests/TestForestModel.py @@ -119,54 +119,55 @@ def tearDown(self): # logging.debug("Model predictions are consistent with previously stored predictions.") - - def test_regression(self): - """ - Regression test to ensure consistent model results. - """ - # Load the previously stored predictions (if any) - previous_predictions = self.load_previous_predictions() +## TODO : Fix regression Tests + + # def test_regression(self): + # """ + # Regression test to ensure consistent model results. 
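# Small sketch of what OneHotWrapper (with impute_missing=True) above does for
# one categorical column containing a missing value: NaN is imputed to the
# string 'missing' and then one-hot encoded; the wrapper later drops the
# missing-value column from the encoding it exposes. Toy column name and values.
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

col = pd.DataFrame({'purpose_true': ['home', 'work', np.nan]}, dtype=object)
enc = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'))
encoded = enc.fit_transform(col).toarray()
# columns correspond to ['home', 'missing', 'work']; the 'missing' column is
# the one OneHotWrapper subsequently drops (the *_nan case)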
+ # """ + # # Load the previously stored predictions (if any) + # previous_predictions = self.load_previous_predictions() - # Run the current model to get predictions - current_predictions = self.run_current_model() - - # If there are no previous predictions, store the current predictions - if previous_predictions is None: - self.store_predictions(current_predictions) - else: - # Compare the current predictions with the previous predictions - self.assertPredictionsMatch(previous_predictions, current_predictions) - - def load_previous_predictions(self): - # Retrieve stored predictions from the database - # Using get_analysis_timeseries_db as an example, replace with the correct method if needed - db = edb.get_analysis_timeseries_db() - predictions = db.find_one({"user_id": self.user_id, "metadata.key": "predictions"}) - return predictions - - def run_current_model(self): - # Placeholder: Run the current model and get predictions - # Replace this with the actual model running code - predictions = None - return predictions - - def store_predictions(self, predictions): - # Store the predictions in the database - # Using get_analysis_timeseries_db as an example, replace with the correct method if needed - db = edb.get_analysis_timeseries_db() - entry = { - "user_id": self.user_id, - "metadata": { - "key": "predictions", - "write_ts": pd.Timestamp.now().timestamp() # Using pandas timestamp as an example - }, - "data": predictions - } - db.insert_one(entry) - - def assertPredictionsMatch(self, prev, curr): - # Placeholder: Check if the predictions match - # This will depend on the format and type of your predictions - # For example, if predictions are lists or arrays, you can use numpy - if not np.array_equal(prev, curr): - self.fail("Current model predictions do not match previously stored predictions!") + # # Run the current model to get predictions + # current_predictions = self.run_current_model() + + # # If there are no previous predictions, store the current predictions + # if previous_predictions is None: + # self.store_predictions(current_predictions) + # else: + # # Compare the current predictions with the previous predictions + # self.assertPredictionsMatch(previous_predictions, current_predictions) + + # def load_previous_predictions(self): + # # Retrieve stored predictions from the database + # # Using get_analysis_timeseries_db as an example, replace with the correct method if needed + # db = edb.get_analysis_timeseries_db() + # predictions = db.find_one({"user_id": self.user_id, "metadata.key": "predictions"}) + # return predictions + + # def run_current_model(self): + # # Placeholder: Run the current model and get predictions + # # Replace this with the actual model running code + # predictions = None + # return predictions + + # def store_predictions(self, predictions): + # # Store the predictions in the database + # # Using get_analysis_timeseries_db as an example, replace with the correct method if needed + # db = edb.get_analysis_timeseries_db() + # entry = { + # "user_id": self.user_id, + # "metadata": { + # "key": "predictions", + # "write_ts": pd.Timestamp.now().timestamp() # Using pandas timestamp as an example + # }, + # "data": predictions + # } + # db.insert_one(entry) + + # def assertPredictionsMatch(self, prev, curr): + # # Placeholder: Check if the predictions match + # # This will depend on the format and type of your predictions + # # For example, if predictions are lists or arrays, you can use numpy + # if not np.array_equal(prev, curr): + # self.fail("Current 
model predictions do not match previously stored predictions!") diff --git a/emission/tests/modellingTests/TestRunForestModel.py b/emission/tests/modellingTests/TestRunForestModel.py index 382ef4074..2edcc92de 100644 --- a/emission/tests/modellingTests/TestRunForestModel.py +++ b/emission/tests/modellingTests/TestRunForestModel.py @@ -140,8 +140,8 @@ def testTrainForestModelWithZeroTrips(self): "pipeline should not have a current timestamp for the test user") - def testPredictForestModelWithZeroTrips(self): - """ + def test1RoundPredictForestModel(self): + """ forest model takes config arguments via the constructor for testing purposes but will load from a file in /conf/analysis/ which is tested here """ @@ -175,13 +175,9 @@ def testPredictForestModelWithZeroTrips(self): ) logging.debug(f'(TEST) testing prediction of stored model') - test = etmm.build_mock_trip( - user_id=self.user_id, - origin=self.origin, - destination=self.destination - ) + test = esda.get_entries(key="analysis/confirmed_trip", user_id=self.user_id, time_query=None) prediction, n = eamur.predict_labels_with_n( - trip = test, + trip = test[0], model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, model_config=forest_model_config diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index 252b2ad34..2bb1a958e 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -166,7 +166,10 @@ def build_mock_trip( "type": "Point", "coordinates": destination }, - "user_input": labels + #necessary valued for random forest model + "user_input": labels, + "duration": end_ts-start_ts, + "distance": ecc.calDistance(origin,destination) } return ecwe.Entry.create_fake_entry(user_id, key, data, write_ts=time.time()) From 1d7be5a60f4e386b1ce6a72676131245e3ab0ccc Mon Sep 17 00:00:00 2001 From: $aTyam Date: Thu, 2 Nov 2023 18:00:44 -0400 Subject: [PATCH 12/28] Minor fixes Removed redundancies and unnecessary code segments. 
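# Sketch of the two fields the mock-trip change above adds so the forest model
# has its required features. calDistance is replaced here by a local haversine
# helper purely for illustration; the patch itself uses ecc.calDistance.
import math

def haversine_m(p1, p2):
    # p1, p2 are [lon, lat] pairs, matching the GeoJSON coordinate order
    lon1, lat1, lon2, lat2 = map(math.radians, [*p1, *p2])
    a = (math.sin((lat2 - lat1) / 2) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371000 * math.asin(math.sqrt(a))

origin, destination = [-105.08, 39.77], [-104.99, 39.74]
start_ts, end_ts = 1700000000, 1700001200
extra_trip_fields = {'duration': end_ts - start_ts,                 # seconds
                     'distance': haversine_m(origin, destination)}  # meters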
--- .../modelling/trip_model/config copy.py | 79 +++++++++++++++ .../trip_model/greedy_similarity_binning.py | 1 - .../modelling/trip_model/run_model.py | 4 +- .../analysis/modelling/trip_model/util.py | 98 ------------------- 4 files changed, 81 insertions(+), 101 deletions(-) create mode 100644 emission/analysis/modelling/trip_model/config copy.py diff --git a/emission/analysis/modelling/trip_model/config copy.py b/emission/analysis/modelling/trip_model/config copy.py new file mode 100644 index 000000000..76b3c6e6d --- /dev/null +++ b/emission/analysis/modelling/trip_model/config copy.py @@ -0,0 +1,79 @@ +import json +import re +from this import d +from typing import Optional +import logging +from numpy import isin + +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.analysis.modelling.trip_model.model_type as eamumt + +config_filename = "" + +def load_config(): + global config_filename + try: + config_filename = 'conf/analysis/trip_model.conf.json' + config_file = open(config_filename) + except: + print("analysis.trip_model.conf.json not configured, falling back to sample, default configuration") + config_filename = 'conf/analysis/trip_model.conf.json.sample' + config_file = open('conf/analysis/trip_model.conf.json.sample') + ret_val = json.load(config_file) + config_file.close() + return ret_val + +config_data = load_config() + +def reload_config(): + global config_data + config_data = load_config() + +def get_config(): + return config_data + +def get_optional_config_value(key) -> Optional[str]: + """ + get a config value at the provided path/key + + :param key: a key name or a dot-delimited path to some key within the config object + :return: the value at the key, or, None if not found + """ + cursor = config_data + path = key.split(".") + for k in path: + cursor = cursor.get(k) + if cursor is None: + return None + return cursor + +def get_config_value_or_raise(key): + logging.debug(f'getting key {key} in config') + value = get_optional_config_value(key) + if value is None: + logging.debug('config object:') + logging.debug(json.dumps(config_data, indent=2)) + msg = f"expected config key {key} not found in config file {config_filename}" + raise KeyError(msg) + else: + return value + +def get_model_type(): + model_type_str = get_config_value_or_raise('model_type') + model_type = eamumt.ModelType.from_str(model_type_str) + return model_type + +def get_model_storage(): + model_storage_str = get_config_value_or_raise('model_storage') + model_storage = eamums.ModelStorage.from_str(model_storage_str) + return model_storage + +def get_minimum_trips(): + minimum_trips = get_config_value_or_raise('minimum_trips') + if not isinstance(minimum_trips, int): + msg = f"config key 'minimum_trips' not an integer in config file {config_filename}" + raise TypeError(msg) + return minimum_trips + + + diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index a19f5e5c0..226fdefb5 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -133,7 +133,6 @@ def fit(self, trips: List[ecwc.Confirmedtrip]): corresponds to a label at the matching index of the label input :param trips: 2D array of features to train from - :param tripsdf: trips data in dataframe format """ logging.debug(f'fit called with {len(trips)} trips') diff --git 
a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py index 63f1f2ef0..3e7f6c11b 100644 --- a/emission/analysis/modelling/trip_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -72,8 +72,8 @@ def update_trip_model( epq.mark_trip_model_failed(user_id) else: - # train and store the model. pass both List of event and dataframe time data - # that both standard( which mostly work on df) and self implemented models can use. + # train and store the model. pass only List of event and only convert + # to dataframe type data whereever required. model.fit(trips) model_data_next = model.to_dict() diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py index 0728fb702..b3a9a012c 100644 --- a/emission/analysis/modelling/trip_model/util.py +++ b/emission/analysis/modelling/trip_model/util.py @@ -3,12 +3,6 @@ import numpy as np import pandas as pd from numpy.linalg import norm -import copy - -from sklearn.preprocessing import OneHotEncoder -from sklearn.pipeline import make_pipeline -from sklearn.impute import SimpleImputer - def find_knee_point(values: List[float]) -> Tuple[float, int]: """for a list of values, find the value which represents the cut-off point @@ -76,95 +70,3 @@ def single_cluster_purity(points_in_cluster, label_col='purpose_confirm'): purity = len(points_in_cluster[points_in_cluster[label_col] == most_freq_label]) / len(points_in_cluster) return purity - - -class OneHotWrapper(): - """ Helper class to streamline one-hot encoding. - - Args: - impute_missing (bool): whether or not to impute np.nan values. - sparse (bool): whether or not to return a sparse matrix. - handle_unknown (str): specifies the way unknown categories are - handled during transform. - """ - - def __init__( - self, - impute_missing=False, - sparse=False, - handle_unknown='ignore', - ): - self.impute_missing = impute_missing - if self.impute_missing: - self.encoder = make_pipeline( - SimpleImputer(missing_values=np.nan, - strategy='constant', - fill_value='missing'), - OneHotEncoder(sparse=False, handle_unknown=handle_unknown)) - else: - self.encoder = OneHotEncoder(sparse=sparse, - handle_unknown=handle_unknown) - - def fit_transform(self, train_df, output_col_prefix=None): - """ Establish one-hot encoded variables. - - Args: - train_df (DataFrame): DataFrame containing train trips. 
- output_col_prefix (str): only if train_df is a single column - """ - # TODO: handle pd series - - train_df = train_df.copy() # to avoid SettingWithCopyWarning - - # if imputing, the dtype of each column must be string/object and not - # numerical, otherwise the SimpleImputer will fail - if self.impute_missing: - for col in train_df.columns: - train_df[col] = train_df[col].astype(object) - onehot_encoding = self.encoder.fit_transform(train_df) - self.onehot_encoding_cols_all = [] - for col in train_df.columns: - if train_df.shape[1] > 1 or output_col_prefix is None: - output_col_prefix = col - self.onehot_encoding_cols_all += [ - f'{output_col_prefix}_{val}' - for val in np.sort(train_df[col].dropna().unique()) - ] - # we handle np.nan separately because it is of type float, and may - # cause issues with np.sort if the rest of the unique values are - # strings - if any((train_df[col].isna())): - self.onehot_encoding_cols_all += [f'{output_col_prefix}_nan'] - - onehot_encoding_df = pd.DataFrame( - onehot_encoding, - columns=self.onehot_encoding_cols_all).set_index(train_df.index) - - # ignore the encoded columns for missing entries - self.onehot_encoding_cols = copy.deepcopy(self.onehot_encoding_cols_all) - for col in self.onehot_encoding_cols_all: - if col.endswith('_nan'): - onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) - self.onehot_encoding_cols.remove(col) - - return onehot_encoding_df.astype(int) - - def transform(self, test_df): - """ One-hot encoded features in accordance with features seen in the - train set. - - Args: - test_df (DataFrame): DataFrame of trips. - """ - # TODO: rename test_df, this one doesn't necessarily need to be a df - onehot_encoding = self.encoder.transform(test_df) - onehot_encoding_df = pd.DataFrame( - onehot_encoding, - columns=self.onehot_encoding_cols_all).set_index(test_df.index) - - # ignore the encoded columns for missing entries - for col in self.onehot_encoding_cols_all: - if col.endswith('_nan'): - onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) - - return onehot_encoding_df.astype(int) \ No newline at end of file From b3d0db2b15342c5481727406052f2e46620fc4b6 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Fri, 3 Nov 2023 00:08:08 -0400 Subject: [PATCH 13/28] Delete Config file Config copy not required. 
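For reference, the helpers in the accidentally committed copy (deleted by this commit) mirror the canonical emission/analysis/modelling/trip_model/config.py; a hedged usage sketch, assuming that module exposes the same functions:

import emission.analysis.modelling.trip_model.config as eamtc

# each accessor goes through get_config_value_or_raise, which raises KeyError
# if the key is missing from conf/analysis/trip_model.conf.json (or its .sample fallback)
model_type = eamtc.get_model_type()        # ModelType.from_str of the 'model_type' value
model_storage = eamtc.get_model_storage()  # ModelStorage.from_str of the 'model_storage' value
min_trips = eamtc.get_minimum_trips()      # raises TypeError if 'minimum_trips' is not an int
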
--- .../modelling/trip_model/config copy.py | 79 ------------------- 1 file changed, 79 deletions(-) delete mode 100644 emission/analysis/modelling/trip_model/config copy.py diff --git a/emission/analysis/modelling/trip_model/config copy.py b/emission/analysis/modelling/trip_model/config copy.py deleted file mode 100644 index 76b3c6e6d..000000000 --- a/emission/analysis/modelling/trip_model/config copy.py +++ /dev/null @@ -1,79 +0,0 @@ -import json -import re -from this import d -from typing import Optional -import logging -from numpy import isin - -import emission.analysis.modelling.trip_model.model_storage as eamums -import emission.analysis.modelling.trip_model.model_type as eamumt - -config_filename = "" - -def load_config(): - global config_filename - try: - config_filename = 'conf/analysis/trip_model.conf.json' - config_file = open(config_filename) - except: - print("analysis.trip_model.conf.json not configured, falling back to sample, default configuration") - config_filename = 'conf/analysis/trip_model.conf.json.sample' - config_file = open('conf/analysis/trip_model.conf.json.sample') - ret_val = json.load(config_file) - config_file.close() - return ret_val - -config_data = load_config() - -def reload_config(): - global config_data - config_data = load_config() - -def get_config(): - return config_data - -def get_optional_config_value(key) -> Optional[str]: - """ - get a config value at the provided path/key - - :param key: a key name or a dot-delimited path to some key within the config object - :return: the value at the key, or, None if not found - """ - cursor = config_data - path = key.split(".") - for k in path: - cursor = cursor.get(k) - if cursor is None: - return None - return cursor - -def get_config_value_or_raise(key): - logging.debug(f'getting key {key} in config') - value = get_optional_config_value(key) - if value is None: - logging.debug('config object:') - logging.debug(json.dumps(config_data, indent=2)) - msg = f"expected config key {key} not found in config file {config_filename}" - raise KeyError(msg) - else: - return value - -def get_model_type(): - model_type_str = get_config_value_or_raise('model_type') - model_type = eamumt.ModelType.from_str(model_type_str) - return model_type - -def get_model_storage(): - model_storage_str = get_config_value_or_raise('model_storage') - model_storage = eamums.ModelStorage.from_str(model_storage_str) - return model_storage - -def get_minimum_trips(): - minimum_trips = get_config_value_or_raise('minimum_trips') - if not isinstance(minimum_trips, int): - msg = f"config key 'minimum_trips' not an integer in config file {config_filename}" - raise TypeError(msg) - return minimum_trips - - - From 3b038a955e678df5a799385227976d0af86f38fe Mon Sep 17 00:00:00 2001 From: $aTyam Date: Fri, 3 Nov 2023 18:19:36 -0400 Subject: [PATCH 14/28] removedfile --- .../analysis/modelling/trip_model/models.py | 1194 ----------------- 1 file changed, 1194 deletions(-) delete mode 100644 emission/analysis/modelling/trip_model/models.py diff --git a/emission/analysis/modelling/trip_model/models.py b/emission/analysis/modelling/trip_model/models.py deleted file mode 100644 index a8da464c4..000000000 --- a/emission/analysis/modelling/trip_model/models.py +++ /dev/null @@ -1,1194 +0,0 @@ -######################################################################## -## Copied from /e-mission-eval-private-data/TRB_label_assist/models.py## -######################################################################## - - - -import pandas as pd -import numpy as np 
-from abc import ABCMeta, abstractmethod # to define abstract class "blueprints" -import logging -import copy - -# sklearn imports -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler, OneHotEncoder -from sklearn.impute import SimpleImputer -from sklearn.metrics.pairwise import haversine_distances -from sklearn.cluster import DBSCAN -from sklearn import svm -from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier -from sklearn.tree import DecisionTreeClassifier -from sklearn.exceptions import NotFittedError - -# NOTE: tour_model_extended.similarity is on the -# eval-private-data-compatibility branch in e-mission-server - -# logging.basicConfig(level=logging.DEBUG) - -EARTH_RADIUS = 6371000 - -############################# -## define abstract classes ## -############################# - - -class SetupMixin(metaclass=ABCMeta): - """ class containing code to be reused when setting up estimators. """ - - def _clean_data(self, df): - """ Clean a dataframe of trips. - (Drop trips with missing start/end locations, expand the user input - columns, ensure all essential columns are present) - - Args: - df: a dataframe of trips. must contain the columns 'start_loc', - 'end_loc', and should also contain the user input columns - ('mode_confirm', 'purpose_confirm', 'replaced_mode') if - available - """ - assert 'start_loc' in df.columns and 'end_loc' in df.columns - - # clean up the dataframe by dropping entries with NaN locations and - # reset index - num_nan = 0 - if df.start_loc.isna().any(): - num_nan += df.start_loc.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['start_loc']) - if df.end_loc.isna().any(): - num_nan += df.end_loc.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['end_loc']) - - # expand the 'start_loc' and 'end_loc' column into 'start_lat', - # 'start_lon', 'end_lat', and 'end_lon' columns - df = self.expand_coords(df) - - # drop trips with missing coordinates - if df.start_lat.isna().any(): - num_nan += df.start_lat.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['start_lat']) - if df.start_lon.isna().any(): - num_nan += df.start_lon.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['start_lon']) - if df.end_lat.isna().any(): - num_nan += df.end_lat.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['end_lat']) - if df.end_lon.isna().any(): - num_nan = df.end_lon.value_counts(dropna=False).loc[np.nan] - df += df.dropna(subset=['end_lon']) - if num_nan > 0: - logging.info( - f'dropped {num_nan} trips that are missing location coordinates' - ) - - df = df.rename( - columns={ - 'mode_confirm': 'mode_true', - 'purpose_confirm': 'purpose_true', - 'replaced_mode': 'replaced_true' - }) - - for category in ['mode_true', 'purpose_true', 'replaced_true']: - if category not in df.columns: - # for example, if a user labels all their trip modes but none of their trip purposes - df.loc[:, category] = np.nan - - return df.reset_index(drop=True) - - def expand_coords(self,exp_df, purpose=None): - """ - copied and modifed from get_loc_df_for_purpose() in the 'Radius - selection' notebook - """ - purpose_trips = exp_df - if purpose is not None: - purpose_trips = exp_df[exp_df.purpose_confirm == purpose] - - dfs = [purpose_trips] - for loc_type in ['start', 'end']: - df = pd.DataFrame( - purpose_trips[loc_type + - "_loc"].apply(lambda p: p["coordinates"]).to_list(), - columns=[loc_type + "_lon", loc_type + "_lat"]) - df = df.set_index(purpose_trips.index) - dfs.append(df) 
- - # display.display(end_loc_df.head()) - return pd.concat(dfs, axis=1) - - -class Cluster(SetupMixin, metaclass=ABCMeta): - """ blueprint for clustering models. """ - - @abstractmethod - def fit(self, train_df,ct_entry=None): - """ Fit the clustering algorithm. - - Args: - train_df (DataFrame): dataframe of labeled trips - ct_entry (List) : A list of Entry type of labeled and unlabeled trips - - Returns: - self - """ - raise NotImplementedError - - @abstractmethod - def predict(self, test_df): - """ Predict cluster indices for trips, if possible. Trips that could - not be clustered will have the index -1. - - Args: - test_df (DataFrame): dataframe of test trips - - Returns: - pd DataFrame containing one column, 'start_cluster_idx' or - 'end_cluster_idx' - """ - raise NotImplementedError - - def fit_predict(self, train_df): - """ Fit the clustering algorithm and predict cluster indices for trips, - if possible. Trips that could not be clustered will have the index -1. - - Args: - train_df (DataFrame): dataframe of labeled trips - - Returns: - pd DataFrame containing one column, 'start_cluster_idx' or - 'end_cluster_idx' - """ - self.fit(train_df) - return self.predict(train_df) - - -class TripClassifier(SetupMixin, metaclass=ABCMeta): - - @abstractmethod - def fit(self, train_df,ct_entry=None): - """ Fit a classification model. - - Args: - train_df (DataFrame): dataframe of labeled trips - ct_entry (List) : A list of Entry type of labeled and unlabeled trips - - Returns: - self - """ - raise NotImplementedError - - def predict(self, test_df): - """ Predict trip labels. - - Args: - test_df (DataFrame): dataframe of trips - - Returns: - DataFrame containing the following columns: - 'purpose_pred', 'mode_pred', 'replaced_pred', - 'purpose_proba', 'mode_proba', 'replaced_proba' - the *_pred columns contain the most-likely label prediction - (string for a label or float for np.nan). - the *_proba columns contain the probability of the most-likely - prediction. - """ - proba_df = self.predict_proba(test_df) - prediction_df = proba_df.loc[:, [('purpose', 'top_pred'), - ('purpose', 'top_proba'), - ('mode', 'top_pred'), - ('mode', 'top_proba'), - ('replaced', 'top_pred'), - ('replaced', 'top_proba')]] - - prediction_df.columns = prediction_df.columns.to_flat_index() - prediction_df = prediction_df.rename( - columns={ - ('purpose', 'top_pred'): 'purpose_pred', - ('purpose', 'top_proba'): 'purpose_proba', - ('mode', 'top_pred'): 'mode_pred', - ('mode', 'top_proba'): 'mode_proba', - ('replaced', 'top_pred'): 'replaced_pred', - ('replaced', 'top_proba'): 'replaced_proba', - }) - - return prediction_df - - def fit_predict(self, train_df): - """ Fit a classification model and predict trip labels. - - Args: - train_df (DataFrame): dataframe of labeled trips - - Returns: - DataFrame containing the following columns: - 'purpose_pred', 'mode_pred', 'replaced_pred', - 'purpose_proba', 'mode_proba', 'replaced_proba' - the *_pred columns contain the most-likely label prediction - (string for a label or float for np.nan). - the *_proba columns contain the probability of the most-likely - prediction. - """ - self.fit(train_df) - return self.predict(train_df) - - @abstractmethod - def predict_proba(self, test_df): - """ Predict class probabilities for each trip. - - NOTE: check the specific model to see if the class probabilities - have confidence-discounting or not. - - Args: - test_df (DataFrame): dataframe of trips - - Returns: - DataFrame with multiindexing. Each row represents a trip. 
There - are 3 columns at level 1, one for each label category - ('purpose', 'mode', 'replaced'). Within each category, there is - a column for each label, with the row's entry being the - probability that the trip has the label. There are three - additional columns within each category, one indicating the - most-likely label, one indicating the probability of the - most-likely label, and one indicating whether or not the trip - can be clustered. - TODO: add a fourth optional column for the number of trips in - the cluster (if clusterable) - - Level 1 columns are: purpose, mode, replaced - Lebel 2 columns are: - , , ... top_pred, top_proba, clusterable - , , ... top_pred, top_proba, clusterable - , , ... top_pred, top_proba, clusterable - """ - raise NotImplementedError - - -class DBSCANSVMCluster(Cluster): - """ DBSCAN-based clustering algorithm that optionally implements SVM - sub-clustering. - - Args: - loc_type (str): 'start' or 'end', the type of point to cluster - radius (int): max distance between two points in each other's - neighborhood, i.e. DBSCAN's eps value. does not strictly - dictate final cluster size - size_thresh (int): the min number of trips a cluster must have - to be considered for SVM sub-division - purity_thresh (float): the min purity a cluster must have - to be sub-divided using SVM - gamma (float): coefficient for the rbf kernel in SVM - C (float): regularization hyperparameter for SVM - - Attributes: - loc_type (str) - radius (int) - size_thresh (int) - purity_thresh (float) - gamma (float) - C (float) - train_df (DataFrame) - test_df (DataFrame) - base_model (sklearn Estimator) - """ - - def __init__(self, - loc_type='end', - radius=100, - svm=True, - size_thresh=1, - purity_thresh=1.0, - gamma=0.05, - C=1): - logging.info("PERF: Initializing DBSCANSVMCluster") - self.loc_type = loc_type - self.radius = radius - self.svm = svm - self.size_thresh = size_thresh - self.purity_thresh = purity_thresh - self.gamma = gamma - self.C = C - - def set_params(self, params): - if 'loc_type' in params.keys(): self.loc_type = params['loc_type'] - if 'radius' in params.keys(): self.radius = params['radius'] - if 'svm' in params.keys(): self.svm = params['svm'] - if 'size_thresh' in params.keys(): - self.size_thresh = params['size_thresh'] - if 'purity_thresh' in params.keys(): - self.purity_thresh = params['purity_thresh'] - if 'gamma' in params.keys(): self.gamma = params['gamma'] - - return self - - def fit(self, train_df,ct_entry=None): - """ Creates clusters of trip points. - self.train_df will be updated with columns containing base and - final clusters. - - TODO: perhaps move the loc_type argument to fit() so we can use a - single class instance to cluster both start and end points. This - will also help us reduce duplicate data. - - Args: - train_df (dataframe): dataframe of labeled trips - ct_entry (List) : A list of Entry type of labeled and unlabeled trips - """ - ################## - ### clean data ### - ################## - logging.info("PERF: Fitting DBSCANSVMCluster") - self.train_df = self._clean_data(train_df) - - # we can use all trips as long as they have purpose labels. it's ok if - # they're missing mode/replaced-mode labels, because they aren't as - # strongly correlated with location compared to purpose - # TODO: actually, we may want to rethink this. for example, it will - # probably be helpful to include trips that are missing purpose labels - # but still have mode labels. 
- if self.train_df.purpose_true.isna().any(): - num_nan = self.train_df.purpose_true.value_counts( - dropna=False).loc[np.nan] - logging.info( - f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' - ) - self.train_df = self.train_df.dropna( - subset=['purpose_true']).reset_index(drop=True) - if len(self.train_df) == 0: - # i.e. no valid trips after removing all nans - raise Exception('no valid trips; nothing to fit') - - ######################### - ### get base clusters ### - ######################### - dist_matrix_meters = get_distance_matrix(self.train_df, self.loc_type) - self.base_model = DBSCAN(self.radius, - metric="precomputed", - min_samples=1).fit(dist_matrix_meters) - base_clusters = self.base_model.labels_ - - self.train_df.loc[:, - f'{self.loc_type}_base_cluster_idx'] = base_clusters - - ######################## - ### get sub-clusters ### - ######################## - # copy base cluster column into final cluster column - self.train_df.loc[:, f'{self.loc_type}_cluster_idx'] = self.train_df[ - f'{self.loc_type}_base_cluster_idx'] - - if self.svm: - c = 0 # count of how many clusters we have iterated over - - # iterate over all clusters and subdivide them with SVM. the while - # loop is so we can do multiple iterations of subdividing if needed - while c < self.train_df[f'{self.loc_type}_cluster_idx'].max(): - points_in_cluster = self.train_df[ - self.train_df[f'{self.loc_type}_cluster_idx'] == c] - - # only do SVM if we have the minimum num of trips in the cluster - if len(points_in_cluster) < self.size_thresh: - c += 1 - continue - - # only do SVM if purity is below threshold - purity = single_cluster_purity(points_in_cluster, - label_col='purpose_true') - if purity < self.purity_thresh: - X = points_in_cluster[[ - f"{self.loc_type}_lon", f"{self.loc_type}_lat" - ]] - y = points_in_cluster.purpose_true.to_list() - - svm_model = make_pipeline( - StandardScaler(), - svm.SVC( - kernel='rbf', - gamma=self.gamma, - C=self.C, - )).fit(X, y) - labels = svm_model.predict(X) - unique_labels = np.unique(labels) - - # if the SVM predicts that all points in the cluster have - # the same label, just ignore it and don't reindex. - # this also helps us to handle the possibility that a - # cluster may be impure but inherently inseparable, e.g. an - # end cluster at a user's home, containing 50% trips from - # work to home and 50% round trips that start and end at - # home. we don't want to reindex otherwise the low purity - # will trigger SVM again, and we will attempt & fail to - # split the cluster ad infinitum - if len(unique_labels) > 1: - # map purpose labels to new cluster indices - # we offset indices by the max existing index so that we - # don't run into any duplicate indices - max_existing_idx = self.train_df[ - f'{self.loc_type}_cluster_idx'].max() - label_to_cluster = { - unique_labels[i]: i + max_existing_idx + 1 - for i in range(len(unique_labels)) - } - # update trips with their new cluster indices - indices = np.array( - [label_to_cluster[l] for l in labels]) - self.train_df.loc[ - self.train_df[f'{self.loc_type}_cluster_idx'] == c, - f'{self.loc_type}_cluster_idx'] = indices - - c += 1 - # TODO: make things categorical at the end? or maybe at the start of the decision tree pipeline - - return self - - def fit_predict(self, train_df): - """ Override to avoid unnecessarily computation of distance matrices. 
- """ - self.fit(train_df) - return self.train_df[[f'{self.loc_type}_cluster_idx']] - - def predict(self, test_df): - logging.info("PERF: Predicting DBSCANSVMCluster") - # TODO: store clusters as polygons so the prediction is faster - # TODO: we probably don't want to store test_df in self to be more memory-efficient - self.test_df = self._clean_data(test_df) - pred_clusters = self._NN_predict(self.test_df) - - self.test_df.loc[:, f'{self.loc_type}_cluster_idx'] = pred_clusters - - return self.test_df[[f'{self.loc_type}_cluster_idx']] - - def _NN_predict(self, test_df): - """ Generate base-cluster predictions for the test data using a - nearest-neighbor approach. - - sklearn doesn't implement predict() for DBSCAN, which is why we - need a custom method. - """ - logging.info("PERF: NN_predicting DBSCANSVMCluster") - n_samples = test_df.shape[0] - labels = np.ones(shape=n_samples, dtype=int) * -1 - - # get coordinates of core points (we can't use model.components_ - # because our input feature was a distance matrix and doesn't contain - # info about the raw coordinates) - # NOTE: technically, every single point in a cluster is a core point - # because it has at least minPts (2) points, including itself, in its - # radius - train_coordinates = self.train_df[[ - f'{self.loc_type}_lat', f'{self.loc_type}_lon' - ]] - train_radians = np.radians(train_coordinates) - - for idx, row in test_df.reset_index(drop=True).iterrows(): - # calculate the distances between the ith test data and all points, - # then find the index of the closest point. if the ith test data is - # within epsilon of the point, then assign its cluster to the ith - # test data (otherwise, leave it as -1, indicating noise). - # unfortunately, pairwise_distances_argmin() does not support - # haversine distance, so we have to reimplement it ourselves - new_loc_radians = np.radians( - row[[self.loc_type + "_lat", self.loc_type + "_lon"]].to_list()) - new_loc_radians = np.reshape(new_loc_radians, (1, 2)) - dist_matrix_meters = haversine_distances( - new_loc_radians, train_radians) * EARTH_RADIUS - - shortest_dist_idx = np.argmin(dist_matrix_meters) - if dist_matrix_meters[0, shortest_dist_idx] < self.radius: - labels[idx] = self.train_df.reset_index( - drop=True).loc[shortest_dist_idx, - f'{self.loc_type}_cluster_idx'] - - return labels - - - -class EnsembleClassifier(TripClassifier, metaclass=ABCMeta): - """ Template class for trip classifiers using ensemble algorithms. 
- - Required args: - loc_feature (str): 'coordinates' or 'cluster' - """ - base_features = [ - 'duration', - 'distance', - 'start_local_dt_year', - 'start_local_dt_month', - 'start_local_dt_day', - 'start_local_dt_hour', - # 'start_local_dt_minute', - 'start_local_dt_weekday', - 'end_local_dt_year', # most likely the same as the start year - 'end_local_dt_month', # most likely the same as the start month - 'end_local_dt_day', - 'end_local_dt_hour', - # 'end_local_dt_minute', - 'end_local_dt_weekday', - ] - targets = ['mode_true', 'purpose_true', 'replaced_true'] - - # required instance attributes - loc_feature = NotImplemented - purpose_enc = NotImplemented - mode_enc = NotImplemented - purpose_predictor = NotImplemented - mode_predictor = NotImplemented - replaced_predictor = NotImplemented - - # required methods - def fit(self, train_df,ct_entry=None): - # get location features - if self.loc_feature == 'cluster': - # fit clustering model(s) and one-hot encode their indices - # TODO: consolidate start/end_cluster_model in a single instance - # that has a location_type parameter in the fit() method - self.end_cluster_model.fit(train_df) - - clusters_to_encode = self.end_cluster_model.train_df[[ - 'end_cluster_idx' - ]].copy() # copy is to avoid SettingWithCopyWarning - - if self.use_start_clusters or self.use_trip_clusters: - self.start_cluster_model.fit(train_df) - - if self.use_start_clusters: - clusters_to_encode = pd.concat([ - clusters_to_encode, - self.start_cluster_model.train_df[['start_cluster_idx']] - ], - axis=1) - if self.use_trip_clusters: - start_end_clusters = pd.concat([ - self.end_cluster_model.train_df[['end_cluster_idx']], - self.start_cluster_model.train_df[['start_cluster_idx']] - ], - axis=1) - trip_cluster_idx = self.trip_grouper.fit_transform( - start_end_clusters) - clusters_to_encode.loc[:, - 'trip_cluster_idx'] = trip_cluster_idx - - loc_features_df = self.cluster_enc.fit_transform( - clusters_to_encode.astype(int)) - - # clean the df again because we need it in the next step - # TODO: remove redundancy - self.train_df = self._clean_data(train_df) - - # TODO: move below code into a reusable function - if self.train_df.purpose_true.isna().any(): - num_nan = self.train_df.purpose_true.value_counts( - dropna=False).loc[np.nan] - logging.info( - f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' - ) - self.train_df = self.train_df.dropna( - subset=['purpose_true']).reset_index(drop=True) - if len(self.train_df) == 0: - # i.e. no valid trips after removing all nans - raise Exception('no valid trips; nothing to fit') - - else: # self.loc_feature == 'coordinates' - self.train_df = self._clean_data(train_df) - - # TODO: move below code into a reusable function - if self.train_df.purpose_true.isna().any(): - num_nan = self.train_df.purpose_true.value_counts( - dropna=False).loc[np.nan] - logging.info( - f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' - ) - self.train_df = self.train_df.dropna( - subset=['purpose_true']).reset_index(drop=True) - if len(self.train_df) == 0: - # i.e. 
no valid trips after removing all nans - raise Exception('no valid trips; nothing to fit') - - loc_features_df = self.train_df[[ - 'start_lon', 'start_lat', 'end_lon', 'end_lat' - ]] - - # prepare data for the ensemble classifiers - - # note that we want to use purpose data to aid our mode predictions, - # and use both purpose and mode data to aid our replaced-mode - # predictions - # thus, we want to one-hot encode the purpose and mode as data - # features, but also preserve an unencoded copy for the target columns - - # dataframe holding all features and targets - self.Xy_train = pd.concat( - [self.train_df[self.base_features + self.targets], loc_features_df], - axis=1) - - # encode purposes and modes - onehot_purpose_df = self.purpose_enc.fit_transform( - self.Xy_train[['purpose_true']], output_col_prefix='purpose') - onehot_mode_df = self.mode_enc.fit_transform( - self.Xy_train[['mode_true']], output_col_prefix='mode') - self.Xy_train = pd.concat( - [self.Xy_train, onehot_purpose_df, onehot_mode_df], axis=1) - - # for predicting purpose, drop encoded purpose and mode features, as - # well as all target labels - self.X_purpose = self.Xy_train.dropna(subset=['purpose_true']).drop( - labels=self.targets + self.purpose_enc.onehot_encoding_cols + - self.mode_enc.onehot_encoding_cols, - axis=1) - - # for predicting mode, we want to keep purpose data - self.X_mode = self.Xy_train.dropna(subset=['mode_true']).drop( - labels=self.targets + self.mode_enc.onehot_encoding_cols, axis=1) - - # for predicting replaced-mode, we want to keep purpose and mode data - self.X_replaced = self.Xy_train.dropna(subset=['replaced_true']).drop( - labels=self.targets, axis=1) - - self.y_purpose = self.Xy_train['purpose_true'].dropna() - self.y_mode = self.Xy_train['mode_true'].dropna() - self.y_replaced = self.Xy_train['replaced_true'].dropna() - - # fit classifiers - if len(self.X_purpose) > 0: - self.purpose_predictor.fit(self.X_purpose, self.y_purpose) - if len(self.X_mode) > 0: - self.mode_predictor.fit(self.X_mode, self.y_mode) - if len(self.X_replaced) > 0: - self.replaced_predictor.fit(self.X_replaced, self.y_replaced) - - return self - - def predict_proba(self, test_df): - """ NOTE: these class probabilities do NOT have a - confidence-discounting heuristic applied. 
- """ - ################ - ### get data ### - ################ - self.X_test_for_purpose = self._get_X_test_for_purpose(test_df) - - ######################## - ### make predictions ### - ######################## - # note that we want to use purpose data to aid our mode predictions, - # and use both purpose and mode data to aid our replaced-mode - # predictions - - # TODO: some of the code across the try and except blocks can be - # consolidated by considering one-hot encoding fully np.nan arrays - try: - purpose_proba_raw = self.purpose_predictor.predict_proba( - self.X_test_for_purpose) - purpose_proba = pd.DataFrame( - purpose_proba_raw, columns=self.purpose_predictor.classes_) - purpose_pred = purpose_proba.idxmax(axis=1) - - # update X_test with one-hot-encoded purpose predictions to aid - # mode predictor - # TODO: converting purpose_pred to a DataFrame feels super - # unnecessary, make this more efficient - onehot_purpose_df = self.purpose_enc.transform( - pd.DataFrame(purpose_pred).set_index( - self.X_test_for_purpose.index)) - self.X_test_for_mode = pd.concat( - [self.X_test_for_purpose, onehot_purpose_df], axis=1) - - mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() - - except NotFittedError as e: - # if we can't predict purpose, we can still try to predict mode and - # replaced-mode without one-hot encoding the purpose - - purpose_pred = np.full((len(self.X_test_for_purpose), ), np.nan) - purpose_proba_raw = np.full((len(self.X_test_for_purpose), 1), 0) - purpose_proba = pd.DataFrame(purpose_proba_raw, columns=[np.nan]) - - self.X_test_for_mode = self.X_test_for_purpose - mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() - - mode_pred = mode_proba.idxmax(axis=1) - replaced_pred = replaced_proba.idxmax(axis=1) - - if (purpose_pred.dtype == np.float64 and mode_pred.dtype == np.float64 - and replaced_pred.dtype == np.float64): - # this indicates that all the predictions are np.nan so none of the - # random forest classifiers were fitted - raise NotFittedError - - # TODO: move this to a Mixin for cluster-based predictors and use the - # 'cluster' column of the proba_df outputs - # if self.drop_unclustered: - # # TODO: actually, we should only drop purpose predictions. we can - # # then impute the missing entries in the purpose feature and still - # # try to predict mode and replaced-mode without it - # self.predictions.loc[ - # self.end_cluster_model.test_df['end_cluster_idx'] == -1, - # ['purpose_pred', 'mode_pred', 'replaced_pred']] = np.nan - - proba_dfs = [] - for label_type, proba in zip( - ['purpose', 'mode', 'replaced'], - [purpose_proba, mode_proba, replaced_proba]): - proba['top_pred'] = proba.idxmax(axis=1) - proba['top_proba'] = proba.max(axis=1, skipna=True) - proba['clusterable'] = self._clusterable( - self.X_test_for_purpose).astype(bool) - proba = pd.concat([proba], keys=[label_type], axis=1) - proba_dfs += [proba] - - self.proba_df = pd.concat(proba_dfs, axis=1) - return self.proba_df - - def _get_X_test_for_purpose(self, test_df): - """ Do the pre-processing to get data that we can then pass into the - ensemble classifiers. 
- """ - if self.loc_feature == 'cluster': - # get clusters - self.end_cluster_model.predict(test_df) - clusters_to_encode = self.end_cluster_model.test_df[[ - 'end_cluster_idx' - ]].copy() # copy is to avoid SettingWithCopyWarning - - if self.use_start_clusters or self.use_trip_clusters: - self.start_cluster_model.predict(test_df) - - if self.use_start_clusters: - clusters_to_encode = pd.concat([ - clusters_to_encode, - self.start_cluster_model.test_df[['start_cluster_idx']] - ], - axis=1) - if self.use_trip_clusters: - start_end_clusters = pd.concat([ - self.end_cluster_model.test_df[['end_cluster_idx']], - self.start_cluster_model.test_df[['start_cluster_idx']] - ], - axis=1) - trip_cluster_idx = self.trip_grouper.transform( - start_end_clusters) - clusters_to_encode.loc[:, - 'trip_cluster_idx'] = trip_cluster_idx - - # one-hot encode the cluster indices - loc_features_df = self.cluster_enc.transform(clusters_to_encode) - else: # self.loc_feature == 'coordinates' - test_df = self._clean_data(test_df) - loc_features_df = test_df[[ - 'start_lon', 'start_lat', 'end_lon', 'end_lat' - ]] - - # extract the desired data - X_test = pd.concat([ - test_df[self.base_features].reset_index(drop=True), - loc_features_df.reset_index(drop=True) - ], - axis=1) - - return X_test - - def _try_predict_proba_mode_replaced(self): - """ Try to predict mode and replaced-mode. Handles error in case the - ensemble algorithms were not fitted. - - Requires self.X_test_for_mode to have already been set. (These are - the DataFrames containing the test data to be passed into self. - mode_predictor.) - - Returns: mode_proba and replaced_proba, two DataFrames containing - class probabilities for mode and replaced-mode respectively - """ - - try: - # predict mode - mode_proba_raw = self.mode_predictor.predict_proba( - self.X_test_for_mode) - mode_proba = pd.DataFrame(mode_proba_raw, - columns=self.mode_predictor.classes_) - mode_pred = mode_proba.idxmax(axis=1) - - # update X_test with one-hot-encoded mode predictions to aid - # replaced-mode predictor - onehot_mode_df = self.mode_enc.transform( - pd.DataFrame(mode_pred).set_index(self.X_test_for_mode.index)) - self.X_test_for_replaced = pd.concat( - [self.X_test_for_mode, onehot_mode_df], axis=1) - replaced_proba = self._try_predict_proba_replaced() - - except NotFittedError as e: - mode_proba_raw = np.full((len(self.X_test_for_mode), 1), 0) - mode_proba = pd.DataFrame(mode_proba_raw, columns=[np.nan]) - - # if we don't have mode predictions, we *could* still try to - # predict replaced mode (but if the user didn't input mode labels - # then it's unlikely they would input replaced-mode) - self.X_test_for_replaced = self.X_test_for_mode - replaced_proba = self._try_predict_proba_replaced() - - return mode_proba, replaced_proba - - def _try_predict_proba_replaced(self): - """ Try to predict replaced mode. Handles error in case the - replaced_predictor was not fitted. - - Requires self.X_test_for_replaced to have already been set. (This - is the DataFrame containing the test data to be passed into self. - replaced_predictor.) 
- - Returns: replaced_proba, DataFrame containing class probabilities - for replaced-mode - """ - try: - replaced_proba_raw = self.replaced_predictor.predict_proba( - self.X_test_for_replaced - ) # has shape (len_trips, number of replaced_mode classes) - replaced_proba = pd.DataFrame( - replaced_proba_raw, columns=self.replaced_predictor.classes_) - - except NotFittedError as e: - replaced_proba_raw = np.full((len(self.X_test_for_replaced), 1), 0) - replaced_proba = pd.DataFrame(replaced_proba_raw, columns=[np.nan]) - - return replaced_proba - - def _clusterable(self, test_df): - """ Check if the end points can be clustered (i.e. are within - meters of an end point from the training set) - """ - if self.loc_feature == 'cluster': - return self.end_cluster_model.test_df.end_cluster_idx >= 0 - - n_samples = test_df.shape[0] - clustered = np.ones(shape=n_samples, dtype=int) * False - - train_coordinates = self.train_df[['end_lat', 'end_lon']] - train_radians = np.radians(train_coordinates) - - for idx, row in test_df.reset_index(drop=True).iterrows(): - # calculate the distances between the ith test data and all points, - # then find the minimum distance for each point and check if it's - # within the distance threshold. - # unfortunately, pairwise_distances_argmin() does not support - # haversine distance, so we have to reimplement it ourselves - new_loc_radians = np.radians(row[["end_lat", "end_lon"]].to_list()) - new_loc_radians = np.reshape(new_loc_radians, (1, 2)) - dist_matrix_meters = haversine_distances( - new_loc_radians, train_radians) * EARTH_RADIUS - - shortest_dist = np.min(dist_matrix_meters) - if shortest_dist < self.radius: - clustered[idx] = True - - return clustered - - -class ForestClassifierModel(EnsembleClassifier): - """ Random forest-based trip classifier. - - Args: - loc_feature (str): 'coordinates' or 'cluster'; whether to use lat/ - lon coordinates or cluster indices for the location feature - radius (int): radius for DBSCAN clustering. only if - loc_feature=='cluster' - size_thresh (int): the min number of trips a cluster must have to - be considered for sub-division via SVM. only if - loc_feature=='cluster' - purity_thresh (float): the min purity a cluster must have to be - sub-divided via SVM. only if loc_feature=='cluster' - gamma (float): coefficient for the rbf kernel in SVM. only if - loc_feature=='cluster' - C (float): regularization hyperparameter for SVM. only if - loc_feature=='cluster' - n_estimators (int): number of estimators in the random forest - criterion (str): function to measure the quality of a split in the - random forest - max_depth (int): max depth of a tree in the random forest. - unlimited if None. - min_samples_split (int): min number of samples required to split an - internal node in a decision tree - min_samples_leaf (int): min number of samples required for a leaf - node in a decision tree - max_features (str): number of features to consider when looking for - the best split in a decision tree - bootstrap (bool): whether bootstrap samples are used when building - decision trees - random_state (int): random state for deterministic random forest - construction - use_start_clusters (bool): whether or not to use start clusters as - input features to the ensemble classifier. only if - loc_feature=='cluster' - use_trip_clusters (bool): whether or not to use trip-level clusters - as input features to the ensemble classifier. 
only if - loc_feature=='cluster' - """ - - def __init__(self,config): - - self.loc_feature = config['loc_feature'] - self.radius = config['radius'] - self.size_thresh = config['size_thresh'] - self.purity_thresh = config['purity_thresh'] - self.gamma = config['gamma'] - self.C = config['C'] - self.n_estimators = config['n_estimators'] - self.criterion =config['criterion'] - self.max_depth = config['max_depth'] if config['max_depth'] != 'null' else None - self.min_samples_split = config['min_samples_split'] - self.min_samples_leaf = config['min_samples_leaf'] - self.max_features = config['max_features'] - self.bootstrap = config['bootstrap'] - self.random_state = config['random_state'] - # self.drop_unclustered = drop_unclustered - self.use_start_clusters = config['use_start_clusters'] - self.use_trip_clusters = config['use_trip_clusters'] - self.base_features = [ - 'duration', - 'distance', - 'start_local_dt_year', - 'start_local_dt_month', - 'start_local_dt_day', - 'start_local_dt_hour', - 'start_local_dt_weekday', - 'end_local_dt_year', # most likely the same as the start year - 'end_local_dt_month', # most likely the same as the start month - 'end_local_dt_day', - 'end_local_dt_hour', - 'end_local_dt_weekday', - ] - self.targets = ['mode_true', 'purpose_true', 'replaced_true'] - - if self.loc_feature == 'cluster': - # clustering algorithm to generate end clusters - self.end_cluster_model = DBSCANSVMCluster( - loc_type='end', - radius=self.radius, - size_thresh=self.size_thresh, - purity_thresh=self.purity_thresh, - gamma=self.gamma, - C=self.C) - - if self.use_start_clusters or self.use_trip_clusters: - # clustering algorithm to generate start clusters - self.start_cluster_model = DBSCANSVMCluster( - loc_type='start', - radius=self.radius, - size_thresh=self.size_thresh, - purity_thresh=self.purity_thresh, - gamma=self.gamma, - C=self.C) - - if self.use_trip_clusters: - # helper class to generate trip-level clusters - self.trip_grouper = TripGrouper( - start_cluster_col='start_cluster_idx', - end_cluster_col='end_cluster_idx') - - # wrapper class to generate one-hot encodings for cluster indices - self.cluster_enc = OneHotWrapper(sparse=False, - handle_unknown='ignore') - - # wrapper class to generate one-hot encodings for purposes and modes - self.purpose_enc = OneHotWrapper(impute_missing=True, - sparse=False, - handle_unknown='error') - self.mode_enc = OneHotWrapper(impute_missing=True, - sparse=False, - handle_unknown='error') - - # ensemble classifiers for each label category - self.purpose_predictor = RandomForestClassifier( - n_estimators=self.n_estimators, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - max_features=self.max_features, - bootstrap=self.bootstrap, - random_state=self.random_state) - self.mode_predictor = RandomForestClassifier( - n_estimators=self.n_estimators, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - max_features=self.max_features, - bootstrap=self.bootstrap, - random_state=self.random_state) - self.replaced_predictor = RandomForestClassifier( - n_estimators=self.n_estimators, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - max_features=self.max_features, - bootstrap=self.bootstrap, - random_state=self.random_state) - - -class TripGrouper(): - """ Helper class to get trip 
clusters from start and end clusters. - - Args: - start_cluster_col (str): name of the column containing start - cluster indices - end_cluster_col (str): name of the column containing end cluster - indices - """ - - def __init__(self, - start_cluster_col='start_cluster_idx', - end_cluster_col='end_cluster_idx'): - self.start_cluster_col = start_cluster_col - self.end_cluster_col = end_cluster_col - - def fit_transform(self, trip_df): - """ Fit and remember possible trip clusters. - - Args: - trip_df (DataFrame): DataFrame containing trips. must have - columns and - """ - trip_groups = trip_df.groupby( - [self.start_cluster_col, self.end_cluster_col]) - - # need dict so we can access the trip indices of all the trips in each - # group. the key is the group tuple and the value is the list of trip - # indices in the group. - self.trip_groups_dict = dict(trip_groups.groups) - - # we want to convert trip-group tuples to to trip-cluster indices, - # hence the pd Series - trip_groups_series = pd.Series(list(self.trip_groups_dict.keys())) - - trip_cluster_idx = np.empty(len(trip_df)) - - for group_idx in range(len(trip_groups_series)): - group_tuple = trip_groups_series[group_idx] - trip_idxs_in_group = self.trip_groups_dict[group_tuple] - trip_cluster_idx[trip_idxs_in_group] = group_idx - - return trip_cluster_idx - - def transform(self, new_trip_df): - """ Get trip clusters for a new set of trips. - - Args: - new_trip_df (DataFrame): DataFrame containing trips. must have - columns and - """ - prediction_trip_groups = new_trip_df.groupby( - [self.start_cluster_col, self.end_cluster_col]) - - # need dict so we can access the trip indices of all the trips in each - # group. the key is the group tuple and the value is the list of trip - # indices in the group. - prediction_trip_groups_dict = dict(prediction_trip_groups.groups) - trip_groups_series = pd.Series(list(self.trip_groups_dict.keys())) - trip_cluster_idx = np.empty(len(new_trip_df)) - - for group_tuple in dict(prediction_trip_groups.groups).keys(): - # check if the trip cluster exists in the training set - trip_idxs_in_group = prediction_trip_groups_dict[group_tuple] - if group_tuple in self.trip_groups_dict.keys(): - # look up the group index from the series we created when we - # fit the model - group_idx = trip_groups_series[trip_groups_series == - group_tuple].index[0] - else: - group_idx = -1 - - trip_cluster_idx[trip_idxs_in_group] = group_idx - - return trip_cluster_idx - - - -class OneHotWrapper(): - """ Helper class to streamline one-hot encoding. - - Args: - impute_missing (bool): whether or not to impute np.nan values. - sparse (bool): whether or not to return a sparse matrix. - handle_unknown (str): specifies the way unknown categories are - handled during transform. - """ - - def __init__( - self, - impute_missing=False, - sparse=False, - handle_unknown='ignore', - ): - self.impute_missing = impute_missing - if self.impute_missing: - self.encoder = make_pipeline( - SimpleImputer(missing_values=np.nan, - strategy='constant', - fill_value='missing'), - OneHotEncoder(sparse=False, handle_unknown=handle_unknown)) - else: - self.encoder = OneHotEncoder(sparse=sparse, - handle_unknown=handle_unknown) - - def fit_transform(self, train_df, output_col_prefix=None): - """ Establish one-hot encoded variables. - - Args: - train_df (DataFrame): DataFrame containing train trips. 
- output_col_prefix (str): only if train_df is a single column - """ - # TODO: handle pd series - - train_df = train_df.copy() # to avoid SettingWithCopyWarning - - # if imputing, the dtype of each column must be string/object and not - # numerical, otherwise the SimpleImputer will fail - if self.impute_missing: - for col in train_df.columns: - train_df[col] = train_df[col].astype(object) - onehot_encoding = self.encoder.fit_transform(train_df) - self.onehot_encoding_cols_all = [] - for col in train_df.columns: - if train_df.shape[1] > 1 or output_col_prefix is None: - output_col_prefix = col - self.onehot_encoding_cols_all += [ - f'{output_col_prefix}_{val}' - for val in np.sort(train_df[col].dropna().unique()) - ] - # we handle np.nan separately because it is of type float, and may - # cause issues with np.sort if the rest of the unique values are - # strings - if any((train_df[col].isna())): - self.onehot_encoding_cols_all += [f'{output_col_prefix}_nan'] - - onehot_encoding_df = pd.DataFrame( - onehot_encoding, - columns=self.onehot_encoding_cols_all).set_index(train_df.index) - - # ignore the encoded columns for missing entries - self.onehot_encoding_cols = copy.deepcopy(self.onehot_encoding_cols_all) - for col in self.onehot_encoding_cols_all: - if col.endswith('_nan'): - onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) - self.onehot_encoding_cols.remove(col) - - return onehot_encoding_df.astype(int) - - def transform(self, test_df): - """ One-hot encoded features in accordance with features seen in the - train set. - - Args: - test_df (DataFrame): DataFrame of trips. - """ - # TODO: rename test_df, this one doesn't necessarily need to be a df - onehot_encoding = self.encoder.transform(test_df) - onehot_encoding_df = pd.DataFrame( - onehot_encoding, - columns=self.onehot_encoding_cols_all).set_index(test_df.index) - - # ignore the encoded columns for missing entries - for col in self.onehot_encoding_cols_all: - if col.endswith('_nan'): - onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) - - return onehot_encoding_df.astype(int) From 94fc848be0de97d8f2196a36cc0ed5d7ac801276 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Fri, 3 Nov 2023 18:28:33 -0400 Subject: [PATCH 15/28] Update model.py --- .../models.py => emission/analysis/modelling/trip_model/model.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename TRB_label_assist/models.py => emission/analysis/modelling/trip_model/model.py (100%) diff --git a/TRB_label_assist/models.py b/emission/analysis/modelling/trip_model/model.py similarity index 100% rename from TRB_label_assist/models.py rename to emission/analysis/modelling/trip_model/model.py From 33cdaab95c927497fba1c39db83adc5c786dd727 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Sat, 9 Dec 2023 00:01:59 -0500 Subject: [PATCH 16/28] [Tested, Will fail]Integrating RF model on server and more Unit test This will fail due to testForest.py file. Changes here include : 1. Integrated the shifting of randomForest model from eval to server. 2. unit tests for Model save and load 3. RegressionTest for RF model in testRandomForest.py. 
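The model save/load support added in this commit serializes each fitted estimator attribute to raw bytes with joblib inside to_dict and restores it in from_dict. A minimal sketch of that round-trip pattern, with hypothetical helper names (the actual code iterates over the attribute list and raises on missing or unserializable attributes):

from io import BytesIO
import joblib

def attr_to_bytes(model, attribute_name):
    # as in ForestClassifierModel.to_dict: dump one fitted estimator to bytes
    buffer = BytesIO()
    joblib.dump(getattr(model, attribute_name), buffer)
    buffer.seek(0)
    return buffer.getvalue()

def attr_from_bytes(model, attribute_name, raw_bytes):
    # as in ForestClassifierModel.from_dict: rebuild the estimator and reattach it
    setattr(model, attribute_name, joblib.load(BytesIO(raw_bytes)))
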
--- .../modelling/trip_model/forest_classifier.py | 45 ++- .../modelling/trip_model/model_type.py | 2 +- .../tests/modellingTests/TestForestModel.py | 250 ++++++-------- .../TestForestModelLoadandSave.py | 317 ++++++++++++++++++ .../modellingTests/TestRunForestModel.py | 23 +- 5 files changed, 471 insertions(+), 166 deletions(-) create mode 100644 emission/tests/modellingTests/TestForestModelLoadandSave.py diff --git a/emission/analysis/modelling/trip_model/forest_classifier.py b/emission/analysis/modelling/trip_model/forest_classifier.py index 5a23d867e..a8d1dd2de 100644 --- a/emission/analysis/modelling/trip_model/forest_classifier.py +++ b/emission/analysis/modelling/trip_model/forest_classifier.py @@ -11,11 +11,11 @@ import emission.analysis.modelling.trip_model.config as eamtc import emission.storage.timeseries.builtin_timeseries as estb import emission.storage.decorations.trip_queries as esdtq -from emission.analysis.modelling.trip_model.models import ForestClassifierModel +from emission.analysis.modelling.trip_model.models import ForestClassifier EARTH_RADIUS = 6371000 -class ForestClassifier(eamuu.TripModel): +class ForestClassifierModel(eamuu.TripModel): def __init__(self,config=None): @@ -54,7 +54,24 @@ def __init__(self,config=None): if config.get(k) is None: msg = f"cluster trip model config missing expected key {k}" raise KeyError(msg) - self.model=ForestClassifierModel(config=config) + maxdepth =config['max_depth'] if config['max_depth']!='null' else None + self.model=ForestClassifier( loc_feature=config['loc_feature'], + radius= config['radius'], + size_thresh=config['radius'], + purity_thresh=config['purity_thresh'], + gamma=config['gamma'], + C=config['C'], + n_estimators=config['n_estimators'], + criterion=config['criterion'], + max_depth=maxdepth, + min_samples_split=config['min_samples_split'], + min_samples_leaf=config['min_samples_leaf'], + max_features=config['max_features'], + bootstrap=config['bootstrap'], + random_state=config['random_state'], + # drop_unclustered=False, + use_start_clusters=config['use_start_clusters'], + use_trip_clusters=config['use_trip_clusters']) def fit(self,trips: List[ecwc.Confirmedtrip]): @@ -89,7 +106,7 @@ def predict(self, trip: List[float]) -> Tuple[List[Dict], int]: msg = f'model.predict cannot be called with an empty trips' raise Exception(msg) # CONVERT LIST OF TRIPS TO dataFrame - test_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",[trip]) + test_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",trip) labeled_trip_df = esdtq.filter_labeled_trips(test_df) expanded_labeled_trip_df= esdtq.expand_userinputs(labeled_trip_df) predcitions_df= self.model.predict(expanded_labeled_trip_df) @@ -128,8 +145,14 @@ def to_dict(self): ## confirm this includes all the extra encoders/models attr.extend([ 'cluster_enc','end_cluster_model','start_cluster_model','trip_grouper']) for attribute_name in attr: + if not hasattr(self.model,attribute_name): + raise ValueError(f"Attribute {attribute_name} not found in the model") + buffer=BytesIO() - joblib.dump(getattr(self.model,attribute_name),buffer) + try: + joblib.dump(getattr(self.model,attribute_name),buffer) + except Exception as e: + raise RuntimeError(f"Error serializing { attribute_name}: {str(e)}") buffer.seek(0) data[attribute_name]=buffer.getvalue() @@ -144,14 +167,14 @@ def from_dict(self,model: Dict): ## TODO : confirm this includes all the extra encoders/models attr.extend([ 'cluster_enc','end_cluster_model','start_cluster_model','trip_grouper']) for 
attribute_name in attr: + if attribute_name not in model: + raise ValueError(f"Attribute {attribute_name} missing in the model") try: - if attribute_name in model: - buffer = BytesIO(model[attribute_name]) - setattr(self.model,attribute_name, joblib.load(buffer)) + buffer = BytesIO(model[attribute_name]) + setattr(self.model,attribute_name, joblib.load(buffer)) except Exception as e: - print(f"Error loading {attribute_name}: {str(e)}") - # If we do not wish to raise the exception after logging the error, comment the line below - raise e + raise RuntimeError(f"Error deserializing { attribute_name}: {str(e)}") + # If we do not wish to raise the exception after logging the error, comment the line above def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: """ diff --git a/emission/analysis/modelling/trip_model/model_type.py b/emission/analysis/modelling/trip_model/model_type.py index 56268a51a..16f27ae78 100644 --- a/emission/analysis/modelling/trip_model/model_type.py +++ b/emission/analysis/modelling/trip_model/model_type.py @@ -28,7 +28,7 @@ def build(self, config=None) -> eamuu.TripModel: # Dict[ModelType, TripModel] MODELS = { ModelType.GREEDY_SIMILARITY_BINNING: eamug.GreedySimilarityBinning, - ModelType.RANDOM_FOREST_CLASSIFIER: eamuf.ForestClassifier + ModelType.RANDOM_FOREST_CLASSIFIER: eamuf.ForestClassifierModel } model = MODELS.get(self) if model is None: diff --git a/emission/tests/modellingTests/TestForestModel.py b/emission/tests/modellingTests/TestForestModel.py index 8895cb366..58e96252b 100644 --- a/emission/tests/modellingTests/TestForestModel.py +++ b/emission/tests/modellingTests/TestForestModel.py @@ -1,25 +1,23 @@ import unittest import logging +import numpy as np +import uuid +import json +import os -import emission.analysis.modelling.trip_model.model_storage as eamums -import emission.analysis.modelling.trip_model.model_type as eamumt import emission.analysis.modelling.trip_model.run_model as eamur +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.storage.json_wrappers as esj import emission.storage.timeseries.abstract_timeseries as esta import emission.tests.modellingTests.modellingTestAssets as etmm import emission.storage.decorations.analysis_timeseries_queries as esda import emission.core.get_database as edb -import emission.storage.pipeline_queries as epq -import emission.core.wrapper.pipelinestate as ecwp -import numpy as np +import emission.core.wrapper.entry as ecwe +import emission.storage.decorations.analysis_timeseries_queries as esdatq -class TestRunForestModel(unittest.TestCase): - """these tests were copied forward during a refactor of the tour model - [https://github.com/e-mission/e-mission-server/blob/10772f892385d44e11e51e796b0780d8f6609a2c/emission/analysis/modelling/tour_model_first_only/load_predict.py#L114] +class TestForestModel(unittest.TestCase): - it's uncertain what condition they are in besides having been refactored to - use the more recent tour modeling code. 
- """ - def setUp(self): """ sets up the end-to-end run model test with Confirmedtrip data @@ -27,58 +25,42 @@ def setUp(self): logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.DEBUG) - # configuration for randomly-generated test data - self.user_id = user_id = 'TestRunForestModel-TestData' - self.origin = (-105.1705977, 39.7402654,) - self.destination = (-105.1755606, 39.7673075) - self.min_trips = 14 - self.total_trips = 100 - self.clustered_trips = 33 # must have at least self.min_trips similar trips by default - self.has_label_percent = 0.9 # let's make a few that don't have a label, but invariant - # $clustered_trips * $has_label_percent > self.min_trips - # must be correct or else this test could fail under some random test cases. - - # for a negative test, below - self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl' - - # test data can be saved between test invocations, check if data exists before generating - ts = esta.TimeSeries.get_time_series(user_id) - test_data = list(ts.find_entries(["analysis/confirmed_trip"])) - if len(test_data) == 0: - # generate test data for the database - logging.debug(f"inserting mock Confirmedtrips into database") - - # generate labels with a known sample weight that we can rely on in the test - label_data = { - "mode_confirm": ['ebike', 'bike'], - "purpose_confirm": ['happy-hour', 'dog-park'], - "replaced_mode": ['walk'], - "mode_weights": [0.9, 0.1], - "purpose_weights": [0.1, 0.9] - } - - train = etmm.generate_mock_trips( - user_id=user_id, - trips=self.total_trips, - origin=self.origin, - destination=self.destination, - trip_part='od', - label_data=label_data, - within_threshold=self.clustered_trips, - threshold=0.004, # ~400m - has_label_p=self.has_label_percent - ) - - ts.bulk_insert(train) - - # confirm data write did not fail - test_data = esda.get_entries(key="analysis/confirmed_trip", user_id=user_id, time_query=None) - if len(test_data) != self.total_trips: - logging.debug(f'test invariant failed after generating test data') - self.fail() - else: - logging.debug(f'found {self.total_trips} trips in database') - + self.user_id = uuid.UUID('aa9fdec9-2944-446c-8ee2-50d79b3044d3') + self.ts = esta.TimeSeries.get_time_series(self.user_id) + self.new_trips_per_invocation = 3 + self.model_type = eamumt.ModelType.RANDOM_FOREST_CLASSIFIER + self.model_storage = eamums.ModelStorage.DOCUMENT_DATABASE + sim_threshold = 500 # meters + self.forest_model_config= { + "loc_feature" : "coordinates", + "radius": 500, + "size_thresh":1, + "purity_thresh":1.0, + "gamma":0.05, + "C":1, + "n_estimators":100, + "criterion":"gini", + "max_depth":'null', + "min_samples_split":2, + "min_samples_leaf":1, + "max_features":"sqrt", + "bootstrap":True, + "random_state":42, + "use_start_clusters":False, + "use_trip_clusters":True + } + + existing_entries_for_user = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) + if len(existing_entries_for_user) != 0: + raise Exception(f"test invariant failed, there should be no entries for user {self.user_id}") + + # load in trips from a test file source + input_file = 'emission/tests/data/real_examples/shankari_2016-06-20.expected_confirmed_trips' + with open(input_file, 'r') as f: + trips_json = json.load(f, object_hook=esj.wrapped_object_hook) + self.trips = [ecwe.Entry(r) for r in trips_json] + logging.debug(f'loaded {len(self.trips)} trips from {input_file}') + def tearDown(self): """ clean up database @@ -88,86 +70,64 @@ def tearDown(self): 
edb.get_pipeline_state_db().delete_many({'user_id': self.user_id}) -# def test_model_consistency(self): -# """ -# Test to ensure that the model's predictions on the mock data remain consistent. -# """ -# # Get the mock data from the parent class's setup -# mock_data = self.mock_data - -# # Predict using the model -# current_predictions = eamur.predict_labels_with_n( -# trip=mock_data, -# model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, -# model_storage=eamums.ModelStorage.DOCUMENT_DATABASE -# ) # assuming this is how you get predictions -# ## TODO : -# # Check if there are any previously stored predictions -# stored_predictions = list(self.collection.find({})) - -# if len(stored_predictions) == 0: -# # If not, store the current predictions as the ground truth -# self.collection.insert_many([{"index": i, "prediction": p} for i, p in enumerate(current_predictions)]) -# logging.debug("Stored current model predictions as ground truth.") -# else: -# # If there are stored predictions, compare them with the current predictions -# for stored_pred in stored_predictions: -# index, stored_value = stored_pred["index"], stored_pred["prediction"] -# current_value = current_predictions[index] - -# self.assertEqual(stored_value, current_value, f"Prediction at index {index} has changed! Expected {stored_value}, but got {current_value}.") - -# logging.debug("Model predictions are consistent with previously stored predictions.") -## TODO : Fix regression Tests - - # def test_regression(self): - # """ - # Regression test to ensure consistent model results. - # """ - # # Load the previously stored predictions (if any) - # previous_predictions = self.load_previous_predictions() + def testRandomForestRegression(self): + """ + test to ensure consistent model results. Load data for a user from json, split + into train and test. After training, we generate predictions and match them with + predictions from last time. If the code is run for the first time, the current predicitons + will be stored as ground truth. 
+ """ + file_path= 'emission/tests/modellingTests/data.json' + split=int(0.9*len(self.trips)) + train_data= self.trips[:split] + + self.ts.bulk_insert(train_data) + + # confirm write to database succeeded + self.initial_data = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) + if len(self.initial_data) == 0: + logging.debug(f'Writing train data failed') + self.fail() + + test_data=self.trips[split:] + logging.debug(f'LENDATA{len(train_data),len(test_data)}') + eamur.update_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + min_trips=4, + model_config=self.forest_model_config + ) + model = eamur._load_stored_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + model_config=self.forest_model_config + ) - # # Run the current model to get predictions - # current_predictions = self.run_current_model() - - # # If there are no previous predictions, store the current predictions - # if previous_predictions is None: - # self.store_predictions(current_predictions) - # else: - # # Compare the current predictions with the previous predictions - # self.assertPredictionsMatch(previous_predictions, current_predictions) - - # def load_previous_predictions(self): - # # Retrieve stored predictions from the database - # # Using get_analysis_timeseries_db as an example, replace with the correct method if needed - # db = edb.get_analysis_timeseries_db() - # predictions = db.find_one({"user_id": self.user_id, "metadata.key": "predictions"}) - # return predictions - - # def run_current_model(self): - # # Placeholder: Run the current model and get predictions - # # Replace this with the actual model running code - # predictions = None - # return predictions - - # def store_predictions(self, predictions): - # # Store the predictions in the database - # # Using get_analysis_timeseries_db as an example, replace with the correct method if needed - # db = edb.get_analysis_timeseries_db() - # entry = { - # "user_id": self.user_id, - # "metadata": { - # "key": "predictions", - # "write_ts": pd.Timestamp.now().timestamp() # Using pandas timestamp as an example - # }, - # "data": predictions - # } - # db.insert_one(entry) - - # def assertPredictionsMatch(self, prev, curr): - # # Placeholder: Check if the predictions match - # # This will depend on the format and type of your predictions - # # For example, if predictions are lists or arrays, you can use numpy - # if not np.array_equal(prev, curr): - # self.fail("Current model predictions do not match previously stored predictions!") + curr_predictions_list = eamur.predict_labels_with_n( + trip_list = [test_data], + model=model + ) + + + ## predictions take the form like : + # + #{'labels': {'mode_confirm': 'ebike', 'replaced_mode': 'walk', 'purpose_confirm': 'dog-park'}, 'p': 1.0} + # we can store these predictions in a json and then for every run other than the first we + # can load the predictions and compare + + try: + if os.path.exists(file_path) and os.path.getsize(file_path)>0: + with open(file_path, 'r') as f: + prev_predictions_list = json.load(f) + logging.debug() + self.assertEqual(prev_predictions_list,curr_predictions_list," previous predictions should match current predictions") + else: + with open(file_path,'w') as file: + json.dump(curr_predictions_list,file,indent=4) + logging.debug("Previous predicitons stored for future matching" ) + except 
json.JSONDecodeError: + logging.debug("jsonDecodeErrorError") + return " decoding JSON." \ No newline at end of file diff --git a/emission/tests/modellingTests/TestForestModelLoadandSave.py b/emission/tests/modellingTests/TestForestModelLoadandSave.py new file mode 100644 index 000000000..e9aad23d5 --- /dev/null +++ b/emission/tests/modellingTests/TestForestModelLoadandSave.py @@ -0,0 +1,317 @@ +from typing import ByteString +import unittest +import logging +import pytest +from unittest.mock import patch +import emission.analysis.modelling.trip_model.run_model as eamur +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.trip_model.model_storage as eamums + +import emission.storage.timeseries.abstract_timeseries as esta +import emission.tests.modellingTests.modellingTestAssets as etmm +import emission.storage.decorations.analysis_timeseries_queries as esda +import emission.core.get_database as edb +import emission.storage.pipeline_queries as epq +import emission.core.wrapper.pipelinestate as ecwp + + +class TestForestModelLoadandSave(unittest.TestCase): + """these tests were copied forward during a refactor of the tour model + [https://github.com/e-mission/e-mission-server/blob/10772f892385d44e11e51e796b0780d8f6609a2c/emission/analysis/modelling/tour_model_first_only/load_predict.py#L114] + + it's uncertain what condition they are in besides having been refactored to + use the more recent tour modeling code. + """ + + def setUp(self): + """ + sets up the end-to-end run model test with Confirmedtrip data + """ + logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', + level=logging.DEBUG) + + # configuration for randomly-generated test data + self.user_id = user_id = 'TestForestModelLoadAndSave-TestData' + self.origin = (-105.1705977, 39.7402654,) + self.destination = (-105.1755606, 39.7673075) + self.min_trips = 14 + self.total_trips = 100 + self.clustered_trips = 33 # must have at least self.min_trips similar trips by default + self.has_label_percent = 0.9 # let's make a few that don't have a label, but invariant + # $clustered_trips * $has_label_percent > self.min_trips + # must be correct or else this test could fail under some random test cases. 
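# [Editor's note, illustrative arithmetic only] With the values configured above, the
# invariant in the preceding comment holds: clustered_trips * has_label_percent
# = 33 * 0.9 = 29.7 expected labeled trips in the cluster, which is comfortably above
# min_trips = 14, so the generated mock data should always be enough to build a model.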
+ + # for a negative test, below + self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl' + + # test data can be saved between test invocations, check if data exists before generating + ts = esta.TimeSeries.get_time_series(user_id) + test_data = list(ts.find_entries(["analysis/confirmed_trip"])) + if len(test_data) == 0: + # generate test data for the database + logging.debug(f"inserting mock Confirmedtrips into database") + + # generate labels with a known sample weight that we can rely on in the test + label_data = { + "mode_confirm": ['ebike', 'bike'], + "purpose_confirm": ['happy-hour', 'dog-park'], + "replaced_mode": ['walk'], + "mode_weights": [0.9, 0.1], + "purpose_weights": [0.1, 0.9] + } + + train = etmm.generate_mock_trips( + user_id=user_id, + trips=self.total_trips, + origin=self.origin, + destination=self.destination, + trip_part='od', + label_data=label_data, + within_threshold=self.clustered_trips, + threshold=0.004, # ~400m + has_label_p=self.has_label_percent + ) + + ts.bulk_insert(train) + + # confirm data write did not fail + test_data = esda.get_entries(key="analysis/confirmed_trip", user_id=user_id, time_query=None) + if len(test_data) != self.total_trips: + logging.debug(f'test invariant failed after generating test data') + self.fail() + else: + logging.debug(f'found {self.total_trips} trips in database') + + self.forest_model_config= { + "loc_feature" : "coordinates", + "radius": 500, + "size_thresh":1, + "purity_thresh":1.0, + "gamma":0.05, + "C":1, + "n_estimators":100, + "criterion":"gini", + "max_depth":'null', + "min_samples_split":2, + "min_samples_leaf":1, + "max_features":"sqrt", + "bootstrap":True, + "random_state":42, + "use_start_clusters":False, + "use_trip_clusters":True + } + + def tearDown(self): + """ + clean up database + """ + edb.get_analysis_timeseries_db().delete_many({'user_id': self.user_id}) + edb.get_model_db().delete_many({'user_id': self.user_id}) + edb.get_pipeline_state_db().delete_many({'user_id': self.user_id}) + + def testForestModelRoundTrip(self): + """ + RoundTripTest : Serialising an object with 'to_dict' and then immediately + deserialize it with 'from_dict'. 
After deserialization, the object should have + the same state as original + """ + +# logging.debug(f'creating Random Forest model based on trips in database') + eamur.update_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + min_trips=self.min_trips, + model_config=self.forest_model_config + ) + + model = eamur._load_stored_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + model_config=self.forest_model_config + ) + +# logging.debug(f'Loading test data') + test = esda.get_entries(key="analysis/confirmed_trip", user_id=self.user_id, time_query=None) + +# logging.debug(f'Predictions on trips in database') + + predictions_list = eamur.predict_labels_with_n( + trip_list = [test], + model=model + ) + + # logging.debug(f'Serialising the model ') + + model_data=model.to_dict() + +# logging.debug(f'Deserialising the model') + + + deserialized_model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER + deserialized_model = deserialized_model_type.build(self.forest_model_config) + deserialized_model.from_dict(model_data) + +# logging.debug(f'Predictions on trips using deserialised model') + predictions_loaded_model_list = eamur.predict_labels_with_n( + trip_list = [test], + model=deserialized_model + ) +# logging.debug(f'Assert that both predictions are the same') + self.assertEqual(predictions_list, predictions_loaded_model_list, " should be equal") + + def testForestModelConsistency(self): + """ + ConsistencyTest : To Verify that the serialization and deserialization process + is consistent across multiple executions + """ + # logging.debug(f'creating a model based on trips in database') + eamur.update_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + min_trips=self.min_trips, + model_config=self.forest_model_config + ) + + model_iter1 = eamur._load_stored_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + model_config=self.forest_model_config + ) + + # logging.debug(f'Load Test data') + test = esda.get_entries(key="analysis/confirmed_trip", user_id=self.user_id, time_query=None) + + # logging.debug(f' Model Predictions on trips in database') + + predictions_list_model1 = eamur.predict_labels_with_n( + trip_list = [test], + model=model_iter1 + ) + # logging.debug(f' Loading Model again') + + model_iter2 = eamur._load_stored_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + model_config=self.forest_model_config + ) + # logging.debug(f' Model Predictions on trips in database') + predictions_list_model2 = eamur.predict_labels_with_n( + trip_list = [test], + model=model_iter2 + ) + + self.assertEqual(predictions_list_model1, predictions_list_model2, " should be equal") + + + + def testSerializationErrorHandling(self): + """ + SerialisationErrorHandling : To verify that any errors during + serialising an object with 'to_dict' are handled. 
+ """ + # defining a side effect function to simulate a serialization error + def mock_dump(*args,**kwargs): + raise Exception("Serialization Error") + + logging.debug(f'(TRAIN) creating a model based on trips in database') + eamur.update_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + min_trips=self.min_trips, + model_config=self.forest_model_config + ) + + model = eamur._load_stored_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + model_config=self.forest_model_config + ) + # patch is used to temporarily replace joblib.dump with a + # mock function that raises an exception + # + # side_effect, which is set to mock_dump, is called instead of + # real joblib.dump function when 'to_dict' is invoked + + with patch('joblib.dump',side_effect=mock_dump): + with self.assertRaises(RuntimeError): + model.to_dict() + + + def testDeserializationErrorHandling(self): + """ + deserialisationErrorHandling : To verify that any errors during + deserialising an object with 'from_dict' are handled. + """ + # defining a side effect function to simulate a deserialization error + def mock_load(*args,**kwargs): + raise Exception("Deserialization Error") + + logging.debug(f'(TRAIN) creating a model based on trips in database') + eamur.update_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + min_trips=self.min_trips, + model_config=self.forest_model_config + ) + + model = eamur._load_stored_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + model_config=self.forest_model_config + ) + + model_data=model.to_dict() + + deserialized_model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER + deserialized_model = deserialized_model_type.build(self.forest_model_config) + # patch is used to temporarily replace joblib.load with a + # mock function that raises an exception + # + # side_effect, which is set to mock_load, is called instead of + # real joblib.load function when 'to_dict' is invoked + + with patch('joblib.load',side_effect=mock_load): + with self.assertRaises(RuntimeError): + deserialized_model.from_dict(model_data) + + + def testRandomForestTypePreservation(self): + """ + TypePreservationTest: To ensure that the serialization and deserialization + process maintains the data types of all model attributes. 
+ """ + + logging.debug(f'(TRAIN) creating a model based on trips in database') + eamur.update_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + min_trips=self.min_trips, + model_config=self.forest_model_config + ) + + model = eamur._load_stored_trip_model( + user_id=self.user_id, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + model_config=self.forest_model_config + ) + + model_data=model.to_dict() + loaded_model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER + loaded_model = loaded_model_type.build(self.forest_model_config) + loaded_model.from_dict(model_data) + + + for attr in ['purpose_predictor','mode_predictor','replaced_predictor','purpose_enc','mode_enc','train_df']: + assert isinstance(getattr(loaded_model.model,attr),type(getattr(model.model,attr))) + diff --git a/emission/tests/modellingTests/TestRunForestModel.py b/emission/tests/modellingTests/TestRunForestModel.py index 2edcc92de..edbc7bd7e 100644 --- a/emission/tests/modellingTests/TestRunForestModel.py +++ b/emission/tests/modellingTests/TestRunForestModel.py @@ -1,9 +1,10 @@ import unittest import logging -import emission.analysis.modelling.trip_model.model_storage as eamums -import emission.analysis.modelling.trip_model.model_type as eamumt import emission.analysis.modelling.trip_model.run_model as eamur +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.trip_model.model_storage as eamums + import emission.storage.timeseries.abstract_timeseries as esta import emission.tests.modellingTests.modellingTestAssets as etmm import emission.storage.decorations.analysis_timeseries_queries as esda @@ -176,13 +177,17 @@ def test1RoundPredictForestModel(self): logging.debug(f'(TEST) testing prediction of stored model') test = esda.get_entries(key="analysis/confirmed_trip", user_id=self.user_id, time_query=None) - prediction, n = eamur.predict_labels_with_n( - trip = test[0], + model = eamur._load_stored_trip_model( + user_id=self.user_id, model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, - model_config=forest_model_config + model_config=forest_model_config + ) + + predictions_list = eamur.predict_labels_with_n( + trip_list = [test], + model=model ) - - [logging.debug(p) for p in sorted(prediction, key=lambda r: r['p'], reverse=True)] - - self.assertNotEqual(len(prediction), 0, "should have a prediction") + for prediction, n in predictions_list: + [logging.debug(p) for p in sorted(prediction, key=lambda r: r['p'], reverse=True)] + self.assertNotEqual(len(prediction), 0, "should have a prediction") From 01fcb2a4788bbbc6de82d5f0699a73b030b14d35 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Sat, 9 Dec 2023 00:25:06 -0500 Subject: [PATCH 17/28] minor fix --- .../tests/modellingTests/TestForestModelLoadandSave.py | 7 ++----- emission/tests/modellingTests/TestRunForestModel.py | 8 +++----- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/emission/tests/modellingTests/TestForestModelLoadandSave.py b/emission/tests/modellingTests/TestForestModelLoadandSave.py index e9aad23d5..dddbb160c 100644 --- a/emission/tests/modellingTests/TestForestModelLoadandSave.py +++ b/emission/tests/modellingTests/TestForestModelLoadandSave.py @@ -16,11 +16,8 @@ class TestForestModelLoadandSave(unittest.TestCase): - """these tests were copied forward during a refactor of the tour 
model - [https://github.com/e-mission/e-mission-server/blob/10772f892385d44e11e51e796b0780d8f6609a2c/emission/analysis/modelling/tour_model_first_only/load_predict.py#L114] - - it's uncertain what condition they are in besides having been refactored to - use the more recent tour modeling code. + """ + Tests to make sure the model loads and saves properly """ def setUp(self): diff --git a/emission/tests/modellingTests/TestRunForestModel.py b/emission/tests/modellingTests/TestRunForestModel.py index edbc7bd7e..2ca48c4f4 100644 --- a/emission/tests/modellingTests/TestRunForestModel.py +++ b/emission/tests/modellingTests/TestRunForestModel.py @@ -14,11 +14,9 @@ class TestRunForestModel(unittest.TestCase): - """these tests were copied forward during a refactor of the tour model - [https://github.com/e-mission/e-mission-server/blob/10772f892385d44e11e51e796b0780d8f6609a2c/emission/analysis/modelling/tour_model_first_only/load_predict.py#L114] - - it's uncertain what condition they are in besides having been refactored to - use the more recent tour modeling code. + """ + Tests to ensure the pipeline builds and runs with zero + or more trips """ def setUp(self): From f5fec64263287c12182072d23ed44c083d9fe1f6 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Tue, 12 Dec 2023 17:14:04 -0500 Subject: [PATCH 18/28] Delete model.py This is replaced by models.py (moved with history) --- .../analysis/modelling/trip_model/model.py | 2063 ----------------- 1 file changed, 2063 deletions(-) delete mode 100644 emission/analysis/modelling/trip_model/model.py diff --git a/emission/analysis/modelling/trip_model/model.py b/emission/analysis/modelling/trip_model/model.py deleted file mode 100644 index 475f0b6d0..000000000 --- a/emission/analysis/modelling/trip_model/model.py +++ /dev/null @@ -1,2063 +0,0 @@ -import pandas as pd -import numpy as np -from abc import ABCMeta, abstractmethod # to define abstract class "blueprints" -import logging -import copy - -# sklearn imports -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler, OneHotEncoder -from sklearn.impute import SimpleImputer -from sklearn.metrics.pairwise import haversine_distances -from sklearn.cluster import DBSCAN -from sklearn import svm -from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier -from sklearn.tree import DecisionTreeClassifier -from sklearn.exceptions import NotFittedError - -# our imports -from clustering import get_distance_matrix, single_cluster_purity -import data_wrangling -import emission.storage.decorations.trip_queries as esdtq -from emission.analysis.classification.inference.labels.inferrers import predict_cluster_confidence_discounting -import emission.core.wrapper.entry as ecwe -import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg -import emission.core.common as ecc -import emission.analysis.modelling.trip_model.model_storage as eamums -import emission.analysis.modelling.trip_model.model_type as eamumt -import emission.analysis.modelling.trip_model.run_model as eamur - - -import clustering -# NOTE: tour_model_extended.similarity is on the -# eval-private-data-compatibility branch in e-mission-server - -# logging.basicConfig(level=logging.DEBUG) - -EARTH_RADIUS = 6371000 - -############################# -## define abstract classes ## -############################# - - -class SetupMixin(metaclass=ABCMeta): - """ class containing code to be reused when setting up estimators.
""" - - @abstractmethod - def set_params(self, params): - """ Set the parameters of the estimator. - - Args: - params (dict): dictionary where the keys are the param names - (strings) and the values are the parameter inputs - - Returns: - self - """ - raise NotImplementedError - - def _clean_data(self, df): - """ Clean a dataframe of trips. - (Drop trips with missing start/end locations, expand the user input - columns, ensure all essential columns are present) - - Args: - df: a dataframe of trips. must contain the columns 'start_loc', - 'end_loc', and should also contain the user input columns - ('mode_confirm', 'purpose_confirm', 'replaced_mode') if - available - """ - assert 'start_loc' in df.columns and 'end_loc' in df.columns - - # clean up the dataframe by dropping entries with NaN locations and - # reset index - num_nan = 0 - if df.start_loc.isna().any(): - num_nan += df.start_loc.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['start_loc']) - if df.end_loc.isna().any(): - num_nan += df.end_loc.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['end_loc']) - - # expand the 'start_loc' and 'end_loc' column into 'start_lat', - # 'start_lon', 'end_lat', and 'end_lon' columns - df = data_wrangling.expand_coords(df) - - # drop trips with missing coordinates - if df.start_lat.isna().any(): - num_nan += df.start_lat.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['start_lat']) - if df.start_lon.isna().any(): - num_nan += df.start_lon.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['start_lon']) - if df.end_lat.isna().any(): - num_nan += df.end_lat.value_counts(dropna=False).loc[np.nan] - df = df.dropna(subset=['end_lat']) - if df.end_lon.isna().any(): - num_nan = df.end_lon.value_counts(dropna=False).loc[np.nan] - df += df.dropna(subset=['end_lon']) - if num_nan > 0: - logging.info( - f'dropped {num_nan} trips that are missing location coordinates' - ) - - df = df.rename( - columns={ - 'mode_confirm': 'mode_true', - 'purpose_confirm': 'purpose_true', - 'replaced_mode': 'replaced_true' - }) - - for category in ['mode_true', 'purpose_true', 'replaced_true']: - if category not in df.columns: - # for example, if a user labels all their trip modes but none of their trip purposes - df.loc[:, category] = np.nan - - return df.reset_index(drop=True) - - -class Cluster(SetupMixin, metaclass=ABCMeta): - """ blueprint for clustering models. """ - - @abstractmethod - def fit(self, train_df,ct_entry=None): - """ Fit the clustering algorithm. - - Args: - train_df (DataFrame): dataframe of labeled trips - ct_entry (List) : A list of Entry type of labeled and unlabeled trips - - Returns: - self - """ - raise NotImplementedError - - @abstractmethod - def predict(self, test_df): - """ Predict cluster indices for trips, if possible. Trips that could - not be clustered will have the index -1. - - Args: - test_df (DataFrame): dataframe of test trips - - Returns: - pd DataFrame containing one column, 'start_cluster_idx' or - 'end_cluster_idx' - """ - raise NotImplementedError - - def fit_predict(self, train_df): - """ Fit the clustering algorithm and predict cluster indices for trips, - if possible. Trips that could not be clustered will have the index -1. 
- - Args: - train_df (DataFrame): dataframe of labeled trips - - Returns: - pd DataFrame containing one column, 'start_cluster_idx' or - 'end_cluster_idx' - """ - self.fit(train_df) - return self.predict(train_df) - - -class TripClassifier(SetupMixin, metaclass=ABCMeta): - - @abstractmethod - def fit(self, train_df,ct_entry=None): - """ Fit a classification model. - - Args: - train_df (DataFrame): dataframe of labeled trips - ct_entry (List) : A list of Entry type of labeled and unlabeled trips - - Returns: - self - """ - raise NotImplementedError - - def predict(self, test_df): - """ Predict trip labels. - - Args: - test_df (DataFrame): dataframe of trips - - Returns: - DataFrame containing the following columns: - 'purpose_pred', 'mode_pred', 'replaced_pred', - 'purpose_proba', 'mode_proba', 'replaced_proba' - the *_pred columns contain the most-likely label prediction - (string for a label or float for np.nan). - the *_proba columns contain the probability of the most-likely - prediction. - """ - proba_df = self.predict_proba(test_df) - prediction_df = proba_df.loc[:, [('purpose', 'top_pred'), - ('purpose', 'top_proba'), - ('mode', 'top_pred'), - ('mode', 'top_proba'), - ('replaced', 'top_pred'), - ('replaced', 'top_proba')]] - - prediction_df.columns = prediction_df.columns.to_flat_index() - prediction_df = prediction_df.rename( - columns={ - ('purpose', 'top_pred'): 'purpose_pred', - ('purpose', 'top_proba'): 'purpose_proba', - ('mode', 'top_pred'): 'mode_pred', - ('mode', 'top_proba'): 'mode_proba', - ('replaced', 'top_pred'): 'replaced_pred', - ('replaced', 'top_proba'): 'replaced_proba', - }) - - return prediction_df - - def fit_predict(self, train_df): - """ Fit a classification model and predict trip labels. - - Args: - train_df (DataFrame): dataframe of labeled trips - - Returns: - DataFrame containing the following columns: - 'purpose_pred', 'mode_pred', 'replaced_pred', - 'purpose_proba', 'mode_proba', 'replaced_proba' - the *_pred columns contain the most-likely label prediction - (string for a label or float for np.nan). - the *_proba columns contain the probability of the most-likely - prediction. - """ - self.fit(train_df) - return self.predict(train_df) - - @abstractmethod - def predict_proba(self, test_df): - """ Predict class probabilities for each trip. - - NOTE: check the specific model to see if the class probabilities - have confidence-discounting or not. - - Args: - test_df (DataFrame): dataframe of trips - - Returns: - DataFrame with multiindexing. Each row represents a trip. There - are 3 columns at level 1, one for each label category - ('purpose', 'mode', 'replaced'). Within each category, there is - a column for each label, with the row's entry being the - probability that the trip has the label. There are three - additional columns within each category, one indicating the - most-likely label, one indicating the probability of the - most-likely label, and one indicating whether or not the trip - can be clustered. - TODO: add a fourth optional column for the number of trips in - the cluster (if clusterable) - - Level 1 columns are: purpose, mode, replaced - Lebel 2 columns are: - , , ... top_pred, top_proba, clusterable - , , ... top_pred, top_proba, clusterable - , , ... top_pred, top_proba, clusterable - """ - raise NotImplementedError - - -######################## -## clustering classes ## -######################## - - -class RefactoredNaiveCluster(Cluster): - """ Naive fixed-width clustering algorithm. 
- Refactored from the existing Similarity class to take in dataframes for - consistency, and allows for separate clustering of start and end - clusters. - - WARNING: this algorithm is *extremely* slow. - - Args: - loc_type (str): 'start' or 'end', the type of point to cluster - radius (int): max distance between all pairs of points in a - cluster, i.e. strict maximum cluster width. - - Attributes: - loc_type (str) - radius (int) - train_df (DataFrame) - test_df (DataFrame) - sim_model (Similarity object) - """ - - def __init__(self, loc_type='end', radius=100): - logging.info("PERF: Initializing RefactoredNaiveCluster") - self.loc_type = loc_type - self.radius = radius - - def set_params(self, params): - if 'loc_type' in params.keys(): self.loc_type = params['loc_type'] - if 'radius' in params.keys(): self.radius = params['radius'] - - return self - - def fit(self, train_df,ct_entry=None): - # clean data - logging.info("PERF: Fitting RefactoredNaiveCluster with size %s" % len(train_df)) - self.train_df = self._clean_data(train_df) - - # we can use all trips as long as they have purpose labels. it's ok if - # they're missing mode/replaced-mode labels, because they aren't as - # strongly correlated with location compared to purpose - # TODO: actually, we may want to rethink this. for example, it will - # probably be helpful to include trips that are missing purpose labels - # but still have mode labels. - if self.train_df.purpose_true.isna().any(): - num_nan = self.train_df.purpose_true.value_counts( - dropna=False).loc[np.nan] - logging.info( - f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' - ) - self.train_df = self.train_df.dropna( - subset=['purpose_true']).reset_index(drop=True) - if len(self.train_df) == 0: - # i.e. no valid trips after removing all nans - raise Exception('no valid trips; nothing to fit') - - model_config = { - "metric": "od_similarity", - "similarity_threshold_meters": self.radius, # meters, - "apply_cutoff": False, - "clustering_way":'origin' if self.loc_type=='start' - else 'destination' if self.loc_type =='end' - else 'origin-destination', - "incremental_evaluation": False - } - - # fit the bins - self.sim_model= eamtg.GreedySimilarityBinning(model_config) - cleaned_trip_entry= clustering.cleanEntryTypeData(self.train_df,ct_entry) - self.sim_model.fit(cleaned_trip_entry) - - labels = [int(l) for l in self.sim_model.tripLabels] - self.train_df.loc[:, f'{self.loc_type}_cluster_idx'] = labels - return self - - def predict(self, test_df): - logging.info("PERF: Predicting RefactoredNaiveCluster for %s" % len(test_df)) - self.test_df = self._clean_data(test_df) - - if self.loc_type == 'start': - bins = self.sim_model.bins - elif self.loc_type == 'end': - bins = self.sim_model.bins - - labels = [] - - # for each trip in the test list: - for idx, row in self.test_df.iterrows(): - if idx % 100 == 0: - logging.info("PERF: RefactoredNaiveCluster Working on trip %s/%s" % (idx, len(self.test_df))) - # iterate over all bins - trip_binned = False - for i, bin in enumerate(bins): - # check if the trip can fit in the bin - # if so, get the bin index - if self._match(row, bin, self.loc_type): - labels += [i] - trip_binned = True - break - # if not, return -1 - if not trip_binned: - labels += [-1] - - self.test_df.loc[:, f'{self.loc_type}_cluster_idx'] = labels - - return self.test_df[[f'{self.loc_type}_cluster_idx']] - - def _match(self, trip, bin, loc_type): - """ Check if a trip can fit into an existing bin. 
- - copied from the Similarity class on the e-mission-server. - """ - for t_idx in bin: - trip_in_bin = self.train_df.iloc[int(t_idx)] - if not self._distance_helper(trip, trip_in_bin, loc_type): - return False - return True - - def _distance_helper(self, tripa, tripb, loc_type): - """ Check if two trips have start/end points within the distance - threshold. - - copied from the Similarity class on the e-mission-server. - """ - pta_lat = tripa[[loc_type + '_lat']] - pta_lon = tripa[[loc_type + '_lon']] - ptb_lat = tripb[[loc_type + '_lat']] - ptb_lon = tripb[[loc_type + '_lon']] - - dist= ecc.calDistance([pta_lon,pta_lat],[ptb_lon,ptb_lat]) - return dist <= self.radius - - -class DBSCANSVMCluster(Cluster): - """ DBSCAN-based clustering algorithm that optionally implements SVM - sub-clustering. - - Args: - loc_type (str): 'start' or 'end', the type of point to cluster - radius (int): max distance between two points in each other's - neighborhood, i.e. DBSCAN's eps value. does not strictly - dictate final cluster size - size_thresh (int): the min number of trips a cluster must have - to be considered for SVM sub-division - purity_thresh (float): the min purity a cluster must have - to be sub-divided using SVM - gamma (float): coefficient for the rbf kernel in SVM - C (float): regularization hyperparameter for SVM - - Attributes: - loc_type (str) - radius (int) - size_thresh (int) - purity_thresh (float) - gamma (float) - C (float) - train_df (DataFrame) - test_df (DataFrame) - base_model (sklearn Estimator) - """ - - def __init__(self, - loc_type='end', - radius=100, - svm=True, - size_thresh=1, - purity_thresh=1.0, - gamma=0.05, - C=1): - logging.info("PERF: Initializing DBSCANSVMCluster") - self.loc_type = loc_type - self.radius = radius - self.svm = svm - self.size_thresh = size_thresh - self.purity_thresh = purity_thresh - self.gamma = gamma - self.C = C - - def set_params(self, params): - if 'loc_type' in params.keys(): self.loc_type = params['loc_type'] - if 'radius' in params.keys(): self.radius = params['radius'] - if 'svm' in params.keys(): self.svm = params['svm'] - if 'size_thresh' in params.keys(): - self.size_thresh = params['size_thresh'] - if 'purity_thresh' in params.keys(): - self.purity_thresh = params['purity_thresh'] - if 'gamma' in params.keys(): self.gamma = params['gamma'] - - return self - - def fit(self, train_df,ct_entry=None): - """ Creates clusters of trip points. - self.train_df will be updated with columns containing base and - final clusters. - - TODO: perhaps move the loc_type argument to fit() so we can use a - single class instance to cluster both start and end points. This - will also help us reduce duplicate data. - - Args: - train_df (dataframe): dataframe of labeled trips - ct_entry (List) : A list of Entry type of labeled and unlabeled trips - """ - ################## - ### clean data ### - ################## - logging.info("PERF: Fitting DBSCANSVMCluster") - self.train_df = self._clean_data(train_df) - - # we can use all trips as long as they have purpose labels. it's ok if - # they're missing mode/replaced-mode labels, because they aren't as - # strongly correlated with location compared to purpose - # TODO: actually, we may want to rethink this. for example, it will - # probably be helpful to include trips that are missing purpose labels - # but still have mode labels. 
- if self.train_df.purpose_true.isna().any(): - num_nan = self.train_df.purpose_true.value_counts( - dropna=False).loc[np.nan] - logging.info( - f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' - ) - self.train_df = self.train_df.dropna( - subset=['purpose_true']).reset_index(drop=True) - if len(self.train_df) == 0: - # i.e. no valid trips after removing all nans - raise Exception('no valid trips; nothing to fit') - - ######################### - ### get base clusters ### - ######################### - dist_matrix_meters = get_distance_matrix(self.train_df, self.loc_type) - self.base_model = DBSCAN(self.radius, - metric="precomputed", - min_samples=1).fit(dist_matrix_meters) - base_clusters = self.base_model.labels_ - - self.train_df.loc[:, - f'{self.loc_type}_base_cluster_idx'] = base_clusters - - ######################## - ### get sub-clusters ### - ######################## - # copy base cluster column into final cluster column - self.train_df.loc[:, f'{self.loc_type}_cluster_idx'] = self.train_df[ - f'{self.loc_type}_base_cluster_idx'] - - if self.svm: - c = 0 # count of how many clusters we have iterated over - - # iterate over all clusters and subdivide them with SVM. the while - # loop is so we can do multiple iterations of subdividing if needed - while c < self.train_df[f'{self.loc_type}_cluster_idx'].max(): - points_in_cluster = self.train_df[ - self.train_df[f'{self.loc_type}_cluster_idx'] == c] - - # only do SVM if we have the minimum num of trips in the cluster - if len(points_in_cluster) < self.size_thresh: - c += 1 - continue - - # only do SVM if purity is below threshold - purity = single_cluster_purity(points_in_cluster, - label_col='purpose_true') - if purity < self.purity_thresh: - X = points_in_cluster[[ - f"{self.loc_type}_lon", f"{self.loc_type}_lat" - ]] - y = points_in_cluster.purpose_true.to_list() - - svm_model = make_pipeline( - StandardScaler(), - svm.SVC( - kernel='rbf', - gamma=self.gamma, - C=self.C, - )).fit(X, y) - labels = svm_model.predict(X) - unique_labels = np.unique(labels) - - # if the SVM predicts that all points in the cluster have - # the same label, just ignore it and don't reindex. - # this also helps us to handle the possibility that a - # cluster may be impure but inherently inseparable, e.g. an - # end cluster at a user's home, containing 50% trips from - # work to home and 50% round trips that start and end at - # home. we don't want to reindex otherwise the low purity - # will trigger SVM again, and we will attempt & fail to - # split the cluster ad infinitum - if len(unique_labels) > 1: - # map purpose labels to new cluster indices - # we offset indices by the max existing index so that we - # don't run into any duplicate indices - max_existing_idx = self.train_df[ - f'{self.loc_type}_cluster_idx'].max() - label_to_cluster = { - unique_labels[i]: i + max_existing_idx + 1 - for i in range(len(unique_labels)) - } - # update trips with their new cluster indices - indices = np.array( - [label_to_cluster[l] for l in labels]) - self.train_df.loc[ - self.train_df[f'{self.loc_type}_cluster_idx'] == c, - f'{self.loc_type}_cluster_idx'] = indices - - c += 1 - # TODO: make things categorical at the end? or maybe at the start of the decision tree pipeline - - return self - - def fit_predict(self, train_df): - """ Override to avoid unnecessarily computation of distance matrices. 
- """ - self.fit(train_df) - return self.train_df[[f'{self.loc_type}_cluster_idx']] - - def predict(self, test_df): - logging.info("PERF: Predicting DBSCANSVMCluster") - # TODO: store clusters as polygons so the prediction is faster - # TODO: we probably don't want to store test_df in self to be more memory-efficient - self.test_df = self._clean_data(test_df) - pred_clusters = self._NN_predict(self.test_df) - - self.test_df.loc[:, f'{self.loc_type}_cluster_idx'] = pred_clusters - - return self.test_df[[f'{self.loc_type}_cluster_idx']] - - def _NN_predict(self, test_df): - """ Generate base-cluster predictions for the test data using a - nearest-neighbor approach. - - sklearn doesn't implement predict() for DBSCAN, which is why we - need a custom method. - """ - logging.info("PERF: NN_predicting DBSCANSVMCluster") - n_samples = test_df.shape[0] - labels = np.ones(shape=n_samples, dtype=int) * -1 - - # get coordinates of core points (we can't use model.components_ - # because our input feature was a distance matrix and doesn't contain - # info about the raw coordinates) - # NOTE: technically, every single point in a cluster is a core point - # because it has at least minPts (2) points, including itself, in its - # radius - train_coordinates = self.train_df[[ - f'{self.loc_type}_lat', f'{self.loc_type}_lon' - ]] - train_radians = np.radians(train_coordinates) - - for idx, row in test_df.reset_index(drop=True).iterrows(): - # calculate the distances between the ith test data and all points, - # then find the index of the closest point. if the ith test data is - # within epsilon of the point, then assign its cluster to the ith - # test data (otherwise, leave it as -1, indicating noise). - # unfortunately, pairwise_distances_argmin() does not support - # haversine distance, so we have to reimplement it ourselves - new_loc_radians = np.radians( - row[[self.loc_type + "_lat", self.loc_type + "_lon"]].to_list()) - new_loc_radians = np.reshape(new_loc_radians, (1, 2)) - dist_matrix_meters = haversine_distances( - new_loc_radians, train_radians) * EARTH_RADIUS - - shortest_dist_idx = np.argmin(dist_matrix_meters) - if dist_matrix_meters[0, shortest_dist_idx] < self.radius: - labels[idx] = self.train_df.reset_index( - drop=True).loc[shortest_dist_idx, - f'{self.loc_type}_cluster_idx'] - - return labels - - -###################### -## trip classifiers ## -###################### - - -class NaiveBinningClassifier(TripClassifier): - """ Trip classifier using the existing Similarity class and associated - functions without refactoring them. Essentially a wrapper for the - existing code on e-mission-server. 
- - Args: - radius (int): maximum distance between any two points in the same - cluster - """ - - def __init__(self, radius=500): - logging.info("PERF: Initializing NaiveBinningClassifier") - self.radius = radius - - def set_params(self, params): - if 'radius' in params.keys(): self.radius = params['radius'] - - return self - - def fit(self, train_df,ct_entry=None): - logging.info("PERF: Fitting NaiveBinningClassifier") - # (copied from bsm.build_user_model()) - - # convert train_df to a list because the existing binning algorithm - # only accepts lists of Entry objects - train_trips = self._trip_df_to_list(train_df) - - - model_config = { - "metric": "od_similarity", - "similarity_threshold_meters": self.radius, # meters, - "apply_cutoff": False, - "clustering_way": "origin-destination", #cause thats what is set in performance_eval.py for this model - "incremental_evaluation": False - } - - sim_model = eamtg.GreedySimilarityBinning(model_config) - sim_model.fit(train_trips) - # set instance variables so we can access results later as well - self.sim = sim_model - self.bins = sim_model.bins - - # save all user labels - user_id = train_df.user_id.iloc[0] - model_type=eamumt.ModelType.GREEDY_SIMILARITY_BINNING - model_storage=eamums.ModelStorage.DOCUMENT_DATABASE - model_data_next=sim_model.to_dict() - last_done_ts = eamur._latest_timestamp(train_trips) - eamums.save_model(user_id, model_type, model_data_next, last_done_ts, model_storage) - - return self - - def predict_proba(self, test_df): - """ NOTE: these class probabilities have the confidence-discounting - heuristic applied. - """ - # convert test_df to a list because the existing binning algorithm - # only accepts lists of Entry objects - logging.info("PERF: Predicting NaiveBinningClassifier") - test_trips = self._trip_df_to_list(test_df) - - purpose_distribs = [] - mode_distribs = [] - replaced_distribs = [] - - for trip in test_trips: - trip_prediction = predict_cluster_confidence_discounting(trip) - - if len(trip_prediction) == 0: - # model could not find cluster for the trip - purpose_distribs += [{}] - mode_distribs += [{}] - replaced_distribs += [{}] - - else: - trip_prediction_df = pd.DataFrame(trip_prediction).rename( - columns={'labels': 'user_input'}) - # renaming is simply so we can use the expand_userinputs - # function - - expand_prediction = esdtq.expand_userinputs(trip_prediction_df) - # converts the 'labels' dictionaries into individual columns - - # sum up probability for each label - for label_type, label_distribs in zip( - ['purpose_confirm', 'mode_confirm', 'replaced_mode'], - [purpose_distribs, mode_distribs, replaced_distribs]): - label_distrib = {} - if label_type in expand_prediction.columns: - for label in expand_prediction[label_type].unique(): - label_distrib[label] = expand_prediction.loc[ - expand_prediction[label_type] == label, - 'p'].sum() - label_distribs += [label_distrib] - - proba_dfs = [] - for label_type, label_distribs in zip( - ['purpose', 'mode', 'replaced'], - [purpose_distribs, mode_distribs, replaced_distribs]): - - proba = pd.DataFrame(label_distribs) - proba['clusterable'] = proba.sum(axis=1) > 0 - proba['top_pred'] = proba.drop(columns=['clusterable']).idxmax( - axis=1) - proba['top_proba'] = proba.drop( - columns=['clusterable', 'top_pred']).max(axis=1, skipna=True) - classes = proba.columns[:-3] - proba.loc[:, classes] = proba.loc[:, classes].fillna(0) - proba = pd.concat([proba], keys=[label_type], axis=1) - proba_dfs += [proba] - - self.proba_df = pd.concat(proba_dfs, axis=1) - return 
self.proba_df - - def _trip_df_to_list(self, trip_df): - """ Converts a dataframe of trips into a list of trip Entry objects. - - Allows this class to accept DataFrames (which are used by the new - clustering algorithms) without having to refactor the old - clustering algorithm. - - Args: - trip_df: DataFrame containing trips. See code below for the - expected columns. - - """ - trips_list = [] - - for idx, row in trip_df.iterrows(): - data = { - 'source': row['source'], - 'end_ts': row['end_ts'], - # 'end_local_dt':row['end_local_dt'], # this attribute doesn't seem to appear in the dataframes I've tested with - 'end_fmt_time': row['end_fmt_time'], - 'end_loc': row['end_loc'], - 'raw_trip': row['raw_trip'], - 'start_ts': row['start_ts'], - # 'start_local_dt':row['start_local_dt'], # this attribute doesn't seem to appear in the dataframes I've tested with - 'start_fmt_time': row['start_fmt_time'], - 'start_loc': row['start_loc'], - 'duration': row['duration'], - 'distance': row['distance'], - 'start_place': row['start_place'], - 'end_place': row['end_place'], - 'cleaned_trip': row['cleaned_trip'], - 'inferred_labels': row['inferred_labels'], - 'inferred_trip': row['inferred_trip'], - 'expectation': row['expectation'], - 'confidence_threshold': row['confidence_threshold'], - 'expected_trip': row['expected_trip'], - 'user_input': row['user_input'] - } - trip = ecwe.Entry.create_entry(user_id=row['user_id'], - key='analysis/confirmed_trip', - data=data) - trips_list += [trip] - - return trips_list - - -class ClusterExtrapolationClassifier(TripClassifier): - """ Classifier that extrapolates labels from a trip's cluster. - - Args: - alg (str): clustering algorithm to use; either 'DBSCAN' or 'naive' - radius (int): radius for the clustering algorithm - svm (bool): whether or not to use SVM sub-clustering. (only when - alg=='DBSCAN') - size_thresh (int): the min number of trips a cluster must have - to be considered for SVM sub-division - purity_thresh (float): the min purity a cluster must have - to be sub-divided using SVM - gamma (float): coefficient for the rbf kernel in SVM - C (float): regularization hyperparameter for SVM - cluster_method (str): 'end', 'trip', 'combination'. whether to - extrapolate labels from only end clusters, only trip clusters, - or both end and trip clusters when available. 
- """ - - def __init__( - self, - alg='DBSCAN', - radius=100, # TODO: add diff start and end radii - svm=True, - size_thresh=1, - purity_thresh=1.0, - gamma=0.05, - C=1, - cluster_method='end'): - assert cluster_method in ['end', 'trip', 'combination'] - assert alg in ['DBSCAN', 'naive'] - self.alg = alg - self.radius = radius - self.svm = svm - self.size_thresh = size_thresh - self.purity_thresh = purity_thresh - self.gamma = gamma - self.C = C - self.cluster_method = cluster_method - - if self.alg == 'DBSCAN': - self.end_cluster_model = DBSCANSVMCluster( - loc_type='end', - radius=self.radius, - svm=self.svm, - size_thresh=self.size_thresh, - purity_thresh=self.purity_thresh, - gamma=self.gamma, - C=self.C) - elif self.alg == 'naive': - self.end_cluster_model = RefactoredNaiveCluster(loc_type='end', - radius=self.radius) - - if self.cluster_method in ['trip', 'combination']: - if self.alg == 'DBSCAN': - self.start_cluster_model = DBSCANSVMCluster( - loc_type='start', - radius=self.radius, - svm=self.svm, - size_thresh=self.size_thresh, - purity_thresh=self.purity_thresh, - gamma=self.gamma, - C=self.C) - elif self.alg == 'naive': - self.start_cluster_model = RefactoredNaiveCluster( - loc_type='start', radius=self.radius) - - self.trip_grouper = TripGrouper( - start_cluster_col='start_cluster_idx', - end_cluster_col='end_cluster_idx') - - def set_params(self, params): - """ hacky code that mimics the set_params of an sklearn Estimator class - so that we can pass params during randomizedsearchCV - - Args: - params (dict): a dictionary where the keys are the parameter - names and the values are the parameter values - """ - alg = params['alg'] if 'alg' in params.keys() else self.alg - radius = params['radius'] if 'radius' in params.keys() else self.radius - svm = params['svm'] if 'svm' in params.keys() else self.svm - size_thresh = params['size_thresh'] if 'size_thresh' in params.keys( - ) else self.size_thresh - purity_thresh = params[ - 'purity_thresh'] if 'purity_thresh' in params.keys( - ) else self.purity_thresh - gamma = params['gamma'] if 'gamma' in params.keys() else self.gamma - C = params['C'] if 'C' in params.keys() else self.C - cluster_method = params[ - 'cluster_method'] if 'cluster_method' in params.keys( - ) else self.cluster_method - - # calling __init__ again is not good practice, I know... - self.__init__(alg, radius, svm, size_thresh, purity_thresh, gamma, C, - cluster_method) - - return self - - def fit(self, train_df,ct_entry=None): - # fit clustering model - self.end_cluster_model.fit(train_df,ct_entry) - self.train_df = self.end_cluster_model.train_df - - if self.cluster_method in ['trip', 'combination']: - self.start_cluster_model.fit(train_df,ct_entry) - self.train_df.loc[:, ['start_cluster_idx' - ]] = self.start_cluster_model.train_df[[ - 'start_cluster_idx' - ]] - - # create trip-level clusters - trip_cluster_idx = self.trip_grouper.fit_transform(self.train_df) - self.train_df.loc[:, 'trip_cluster_idx'] = trip_cluster_idx - - return self - - def predict_proba(self, test_df): - """ NOTE: these class probabilities do NOT have a - confidence-discounting heuristic applied. 
- """ - self.end_cluster_model.predict(test_df) - # store a copy of test_df for now (TODO: make this more efficient since - # the data is duplicated) - self.test_df = self.end_cluster_model.test_df - - if self.cluster_method in ['trip', 'combination']: - self.start_cluster_model.predict(test_df) - # append the start cluster indices - self.test_df.loc[:, [ - 'start_cluster_idx' - ]] = self.start_cluster_model.test_df.loc[:, ['start_cluster_idx']] - - # create trip-level clusters - trip_cluster_idx = self.trip_grouper.transform(self.test_df) - self.test_df.loc[:, 'trip_cluster_idx'] = trip_cluster_idx - - # extrapolate label distributions from cluster information - self.test_df.loc[:, [ - 'mode_distrib', 'purpose_distrib', 'replaced_distrib' - ]] = np.nan - - if self.cluster_method in ['end', 'trip']: - cluster_col = f'{self.cluster_method}_cluster_idx' - self.test_df = self._add_label_distributions( - self.test_df, cluster_col) - - else: # self.cluster_method == 'combination' - # try to get label distributions from trip-level clusters first, - # because trip-level clusters tend to be more homogenous and will - # yield more accurate predictions - self.test_df = self._add_label_distributions( - self.test_df, 'trip_cluster_idx') - - # for trips that have an empty label-distribution after the first - # pass using trip clusters, try to get a distribution from the - # destination cluster (this includes both trips that *don't* fall - # into a trip cluster, as well as trips that *do* fall into a trip - # cluster but are missing some/all categories of labels due to - # missing user inputs.) - - # fill in missing label-distributions by the label_type - # (we want to iterate by label_type rather than check cluster idx - # because it's possible that some trips in a trip-cluster have - # predictions for one label_type but not another) - for label_type in ['mode', 'purpose', 'replaced']: - self.test_df.loc[self.test_df[f'{label_type}_distrib'] == - {}] = self._add_label_distributions( - self.test_df.loc[ - self.test_df[f'{label_type}_distrib'] - == {}], - 'end_cluster_idx', - label_types=[label_type]) - - # create the dataframe of probabilities - proba_dfs = [] - for label_type in ['purpose', 'mode', 'replaced']: - classes = self.train_df[f'{label_type}_true'].dropna().unique() - proba = pd.DataFrame( - self.test_df[f'{label_type}_distrib'].to_list(), - columns=classes) - proba['top_pred'] = proba.idxmax(axis=1) - proba['top_proba'] = proba.max(axis=1, skipna=True) - proba['clusterable'] = self.test_df.end_cluster_idx >= 0 - proba.loc[:, classes] = proba.loc[:, classes].fillna(0) - proba = pd.concat([proba], keys=[label_type], axis=1) - proba_dfs += [proba] - - self.proba_df = pd.concat(proba_dfs, axis=1) - return self.proba_df - - def _add_label_distributions(self, - df, - cluster_col, - label_types=['mode', 'purpose', 'replaced']): - """ Add label distributions to a DataFrame. - - Args: - df (DataFrame): DataFrame containing a column of clusters - cluster_col (str): name of column in df containing clusters - label_types (str list): the categories of labels to retrieve - distributions for. - - Returns: - a DataFrame with additional columns in which the entries are - dictionaries containing label distributions. 
- """ - df = df.copy() # to avoid SettingWithCopyWarning - for c in df.loc[:, cluster_col].unique(): - labeled_trips_in_cluster = self.train_df.loc[ - self.train_df[cluster_col] == c] - unlabeled_trips_in_cluster = df.loc[df[cluster_col] == c] - - cluster_size = len(unlabeled_trips_in_cluster) - - for label_type in label_types: - assert label_type in ['mode', 'purpose', 'replaced'] - - # get distribution of label_type labels in this cluster - distrib = labeled_trips_in_cluster[ - f'{label_type}_true'].value_counts(normalize=True, - dropna=True).to_dict() - # TODO: add confidence discounting - - # update predictions - # convert the dict into a list of dicts to work around pandas - # thinking we're trying to insert information according to a - # key-value map - # TODO: this is the line throwing the set on slice warning - df.loc[df[cluster_col] == c, - f'{label_type}_distrib'] = [distrib] * cluster_size - - return df - - -class EnsembleClassifier(TripClassifier, metaclass=ABCMeta): - """ Template class for trip classifiers using ensemble algorithms. - - Required args: - loc_feature (str): 'coordinates' or 'cluster' - """ - base_features = [ - 'duration', - 'distance', - 'start_local_dt_year', - 'start_local_dt_month', - 'start_local_dt_day', - 'start_local_dt_hour', - # 'start_local_dt_minute', - 'start_local_dt_weekday', - 'end_local_dt_year', # most likely the same as the start year - 'end_local_dt_month', # most likely the same as the start month - 'end_local_dt_day', - 'end_local_dt_hour', - # 'end_local_dt_minute', - 'end_local_dt_weekday', - ] - targets = ['mode_true', 'purpose_true', 'replaced_true'] - - # required instance attributes - loc_feature = NotImplemented - purpose_enc = NotImplemented - mode_enc = NotImplemented - purpose_predictor = NotImplemented - mode_predictor = NotImplemented - replaced_predictor = NotImplemented - - # required methods - def fit(self, train_df,ct_entry=None): - # get location features - if self.loc_feature == 'cluster': - # fit clustering model(s) and one-hot encode their indices - # TODO: consolidate start/end_cluster_model in a single instance - # that has a location_type parameter in the fit() method - self.end_cluster_model.fit(train_df) - - clusters_to_encode = self.end_cluster_model.train_df[[ - 'end_cluster_idx' - ]].copy() # copy is to avoid SettingWithCopyWarning - - if self.use_start_clusters or self.use_trip_clusters: - self.start_cluster_model.fit(train_df) - - if self.use_start_clusters: - clusters_to_encode = pd.concat([ - clusters_to_encode, - self.start_cluster_model.train_df[['start_cluster_idx']] - ], - axis=1) - if self.use_trip_clusters: - start_end_clusters = pd.concat([ - self.end_cluster_model.train_df[['end_cluster_idx']], - self.start_cluster_model.train_df[['start_cluster_idx']] - ], - axis=1) - trip_cluster_idx = self.trip_grouper.fit_transform( - start_end_clusters) - clusters_to_encode.loc[:, - 'trip_cluster_idx'] = trip_cluster_idx - - loc_features_df = self.cluster_enc.fit_transform( - clusters_to_encode.astype(int)) - - # clean the df again because we need it in the next step - # TODO: remove redundancy - self.train_df = self._clean_data(train_df) - - # TODO: move below code into a reusable function - if self.train_df.purpose_true.isna().any(): - num_nan = self.train_df.purpose_true.value_counts( - dropna=False).loc[np.nan] - logging.info( - f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' - ) - self.train_df = self.train_df.dropna( - subset=['purpose_true']).reset_index(drop=True) - if 
len(self.train_df) == 0: - # i.e. no valid trips after removing all nans - raise Exception('no valid trips; nothing to fit') - - else: # self.loc_feature == 'coordinates' - self.train_df = self._clean_data(train_df) - - # TODO: move below code into a reusable function - if self.train_df.purpose_true.isna().any(): - num_nan = self.train_df.purpose_true.value_counts( - dropna=False).loc[np.nan] - logging.info( - f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' - ) - self.train_df = self.train_df.dropna( - subset=['purpose_true']).reset_index(drop=True) - if len(self.train_df) == 0: - # i.e. no valid trips after removing all nans - raise Exception('no valid trips; nothing to fit') - - loc_features_df = self.train_df[[ - 'start_lon', 'start_lat', 'end_lon', 'end_lat' - ]] - - # prepare data for the ensemble classifiers - - # note that we want to use purpose data to aid our mode predictions, - # and use both purpose and mode data to aid our replaced-mode - # predictions - # thus, we want to one-hot encode the purpose and mode as data - # features, but also preserve an unencoded copy for the target columns - - # dataframe holding all features and targets - self.Xy_train = pd.concat( - [self.train_df[self.base_features + self.targets], loc_features_df], - axis=1) - - # encode purposes and modes - onehot_purpose_df = self.purpose_enc.fit_transform( - self.Xy_train[['purpose_true']], output_col_prefix='purpose') - onehot_mode_df = self.mode_enc.fit_transform( - self.Xy_train[['mode_true']], output_col_prefix='mode') - self.Xy_train = pd.concat( - [self.Xy_train, onehot_purpose_df, onehot_mode_df], axis=1) - - # for predicting purpose, drop encoded purpose and mode features, as - # well as all target labels - self.X_purpose = self.Xy_train.dropna(subset=['purpose_true']).drop( - labels=self.targets + self.purpose_enc.onehot_encoding_cols + - self.mode_enc.onehot_encoding_cols, - axis=1) - - # for predicting mode, we want to keep purpose data - self.X_mode = self.Xy_train.dropna(subset=['mode_true']).drop( - labels=self.targets + self.mode_enc.onehot_encoding_cols, axis=1) - - # for predicting replaced-mode, we want to keep purpose and mode data - self.X_replaced = self.Xy_train.dropna(subset=['replaced_true']).drop( - labels=self.targets, axis=1) - - self.y_purpose = self.Xy_train['purpose_true'].dropna() - self.y_mode = self.Xy_train['mode_true'].dropna() - self.y_replaced = self.Xy_train['replaced_true'].dropna() - - # fit classifiers - if len(self.X_purpose) > 0: - self.purpose_predictor.fit(self.X_purpose, self.y_purpose) - if len(self.X_mode) > 0: - self.mode_predictor.fit(self.X_mode, self.y_mode) - if len(self.X_replaced) > 0: - self.replaced_predictor.fit(self.X_replaced, self.y_replaced) - - return self - - def predict_proba(self, test_df): - """ NOTE: these class probabilities do NOT have a - confidence-discounting heuristic applied. 
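            As a rough illustration of the layout returned here (hypothetical numbers, pandas only;
            the class names 'work' and 'home' are placeholders):

                import pandas as pd

                # hypothetical class probabilities for two trips
                purpose_proba = pd.DataFrame([[0.7, 0.3], [0.2, 0.8]], columns=['work', 'home'])
                purpose_proba['top_pred'] = purpose_proba[['work', 'home']].idxmax(axis=1)
                purpose_proba['top_proba'] = purpose_proba[['work', 'home']].max(axis=1)

                # one top-level key per label category gives the multi-indexed proba_df layout
                proba_df = pd.concat([purpose_proba], keys=['purpose'], axis=1)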
- """ - ################ - ### get data ### - ################ - self.X_test_for_purpose = self._get_X_test_for_purpose(test_df) - - ######################## - ### make predictions ### - ######################## - # note that we want to use purpose data to aid our mode predictions, - # and use both purpose and mode data to aid our replaced-mode - # predictions - - # TODO: some of the code across the try and except blocks can be - # consolidated by considering one-hot encoding fully np.nan arrays - try: - purpose_proba_raw = self.purpose_predictor.predict_proba( - self.X_test_for_purpose) - purpose_proba = pd.DataFrame( - purpose_proba_raw, columns=self.purpose_predictor.classes_) - purpose_pred = purpose_proba.idxmax(axis=1) - - # update X_test with one-hot-encoded purpose predictions to aid - # mode predictor - # TODO: converting purpose_pred to a DataFrame feels super - # unnecessary, make this more efficient - onehot_purpose_df = self.purpose_enc.transform( - pd.DataFrame(purpose_pred).set_index( - self.X_test_for_purpose.index)) - self.X_test_for_mode = pd.concat( - [self.X_test_for_purpose, onehot_purpose_df], axis=1) - - mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() - - except NotFittedError as e: - # if we can't predict purpose, we can still try to predict mode and - # replaced-mode without one-hot encoding the purpose - - purpose_pred = np.full((len(self.X_test_for_purpose), ), np.nan) - purpose_proba_raw = np.full((len(self.X_test_for_purpose), 1), 0) - purpose_proba = pd.DataFrame(purpose_proba_raw, columns=[np.nan]) - - self.X_test_for_mode = self.X_test_for_purpose - mode_proba, replaced_proba = self._try_predict_proba_mode_replaced() - - mode_pred = mode_proba.idxmax(axis=1) - replaced_pred = replaced_proba.idxmax(axis=1) - - if (purpose_pred.dtype == np.float64 and mode_pred.dtype == np.float64 - and replaced_pred.dtype == np.float64): - # this indicates that all the predictions are np.nan so none of the - # random forest classifiers were fitted - raise NotFittedError - - # TODO: move this to a Mixin for cluster-based predictors and use the - # 'cluster' column of the proba_df outputs - # if self.drop_unclustered: - # # TODO: actually, we should only drop purpose predictions. we can - # # then impute the missing entries in the purpose feature and still - # # try to predict mode and replaced-mode without it - # self.predictions.loc[ - # self.end_cluster_model.test_df['end_cluster_idx'] == -1, - # ['purpose_pred', 'mode_pred', 'replaced_pred']] = np.nan - - proba_dfs = [] - for label_type, proba in zip( - ['purpose', 'mode', 'replaced'], - [purpose_proba, mode_proba, replaced_proba]): - proba['top_pred'] = proba.idxmax(axis=1) - proba['top_proba'] = proba.max(axis=1, skipna=True) - proba['clusterable'] = self._clusterable( - self.X_test_for_purpose).astype(bool) - proba = pd.concat([proba], keys=[label_type], axis=1) - proba_dfs += [proba] - - self.proba_df = pd.concat(proba_dfs, axis=1) - return self.proba_df - - def _get_X_test_for_purpose(self, test_df): - """ Do the pre-processing to get data that we can then pass into the - ensemble classifiers. 
- """ - if self.loc_feature == 'cluster': - # get clusters - self.end_cluster_model.predict(test_df) - clusters_to_encode = self.end_cluster_model.test_df[[ - 'end_cluster_idx' - ]].copy() # copy is to avoid SettingWithCopyWarning - - if self.use_start_clusters or self.use_trip_clusters: - self.start_cluster_model.predict(test_df) - - if self.use_start_clusters: - clusters_to_encode = pd.concat([ - clusters_to_encode, - self.start_cluster_model.test_df[['start_cluster_idx']] - ], - axis=1) - if self.use_trip_clusters: - start_end_clusters = pd.concat([ - self.end_cluster_model.test_df[['end_cluster_idx']], - self.start_cluster_model.test_df[['start_cluster_idx']] - ], - axis=1) - trip_cluster_idx = self.trip_grouper.transform( - start_end_clusters) - clusters_to_encode.loc[:, - 'trip_cluster_idx'] = trip_cluster_idx - - # one-hot encode the cluster indices - loc_features_df = self.cluster_enc.transform(clusters_to_encode) - else: # self.loc_feature == 'coordinates' - test_df = self._clean_data(test_df) - loc_features_df = test_df[[ - 'start_lon', 'start_lat', 'end_lon', 'end_lat' - ]] - - # extract the desired data - X_test = pd.concat([ - test_df[self.base_features].reset_index(drop=True), - loc_features_df.reset_index(drop=True) - ], - axis=1) - - return X_test - - def _try_predict_proba_mode_replaced(self): - """ Try to predict mode and replaced-mode. Handles error in case the - ensemble algorithms were not fitted. - - Requires self.X_test_for_mode to have already been set. (These are - the DataFrames containing the test data to be passed into self. - mode_predictor.) - - Returns: mode_proba and replaced_proba, two DataFrames containing - class probabilities for mode and replaced-mode respectively - """ - - try: - # predict mode - mode_proba_raw = self.mode_predictor.predict_proba( - self.X_test_for_mode) - mode_proba = pd.DataFrame(mode_proba_raw, - columns=self.mode_predictor.classes_) - mode_pred = mode_proba.idxmax(axis=1) - - # update X_test with one-hot-encoded mode predictions to aid - # replaced-mode predictor - onehot_mode_df = self.mode_enc.transform( - pd.DataFrame(mode_pred).set_index(self.X_test_for_mode.index)) - self.X_test_for_replaced = pd.concat( - [self.X_test_for_mode, onehot_mode_df], axis=1) - replaced_proba = self._try_predict_proba_replaced() - - except NotFittedError as e: - mode_proba_raw = np.full((len(self.X_test_for_mode), 1), 0) - mode_proba = pd.DataFrame(mode_proba_raw, columns=[np.nan]) - - # if we don't have mode predictions, we *could* still try to - # predict replaced mode (but if the user didn't input mode labels - # then it's unlikely they would input replaced-mode) - self.X_test_for_replaced = self.X_test_for_mode - replaced_proba = self._try_predict_proba_replaced() - - return mode_proba, replaced_proba - - def _try_predict_proba_replaced(self): - """ Try to predict replaced mode. Handles error in case the - replaced_predictor was not fitted. - - Requires self.X_test_for_replaced to have already been set. (This - is the DataFrame containing the test data to be passed into self. - replaced_predictor.) 
- - Returns: replaced_proba, DataFrame containing class probabilities - for replaced-mode - """ - try: - replaced_proba_raw = self.replaced_predictor.predict_proba( - self.X_test_for_replaced - ) # has shape (len_trips, number of replaced_mode classes) - replaced_proba = pd.DataFrame( - replaced_proba_raw, columns=self.replaced_predictor.classes_) - - except NotFittedError as e: - replaced_proba_raw = np.full((len(self.X_test_for_replaced), 1), 0) - replaced_proba = pd.DataFrame(replaced_proba_raw, columns=[np.nan]) - - return replaced_proba - - def _clusterable(self, test_df): - """ Check if the end points can be clustered (i.e. are within - meters of an end point from the training set) - """ - if self.loc_feature == 'cluster': - return self.end_cluster_model.test_df.end_cluster_idx >= 0 - - n_samples = test_df.shape[0] - clustered = np.ones(shape=n_samples, dtype=int) * False - - train_coordinates = self.train_df[['end_lat', 'end_lon']] - train_radians = np.radians(train_coordinates) - - for idx, row in test_df.reset_index(drop=True).iterrows(): - # calculate the distances between the ith test data and all points, - # then find the minimum distance for each point and check if it's - # within the distance threshold. - # unfortunately, pairwise_distances_argmin() does not support - # haversine distance, so we have to reimplement it ourselves - new_loc_radians = np.radians(row[["end_lat", "end_lon"]].to_list()) - new_loc_radians = np.reshape(new_loc_radians, (1, 2)) - dist_matrix_meters = haversine_distances( - new_loc_radians, train_radians) * EARTH_RADIUS - - shortest_dist = np.min(dist_matrix_meters) - if shortest_dist < self.radius: - clustered[idx] = True - - return clustered - - -class ForestClassifier(EnsembleClassifier): - """ Random forest-based trip classifier. - - Args: - loc_feature (str): 'coordinates' or 'cluster'; whether to use lat/ - lon coordinates or cluster indices for the location feature - radius (int): radius for DBSCAN clustering. only if - loc_feature=='cluster' - size_thresh (int): the min number of trips a cluster must have to - be considered for sub-division via SVM. only if - loc_feature=='cluster' - purity_thresh (float): the min purity a cluster must have to be - sub-divided via SVM. only if loc_feature=='cluster' - gamma (float): coefficient for the rbf kernel in SVM. only if - loc_feature=='cluster' - C (float): regularization hyperparameter for SVM. only if - loc_feature=='cluster' - n_estimators (int): number of estimators in the random forest - criterion (str): function to measure the quality of a split in the - random forest - max_depth (int): max depth of a tree in the random forest. - unlimited if None. - min_samples_split (int): min number of samples required to split an - internal node in a decision tree - min_samples_leaf (int): min number of samples required for a leaf - node in a decision tree - max_features (str): number of features to consider when looking for - the best split in a decision tree - bootstrap (bool): whether bootstrap samples are used when building - decision trees - random_state (int): random state for deterministic random forest - construction - use_start_clusters (bool): whether or not to use start clusters as - input features to the ensemble classifier. only if - loc_feature=='cluster' - use_trip_clusters (bool): whether or not to use trip-level clusters - as input features to the ensemble classifier. 
only if - loc_feature=='cluster' - """ - - def __init__( - self, - loc_feature='coordinates', - radius=100, # TODO: add different start and end radii - size_thresh=1, - purity_thresh=1.0, - gamma=0.05, - C=1, - n_estimators=100, - criterion='gini', - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - max_features='sqrt', - bootstrap=True, - random_state=42, - # drop_unclustered=False, - use_start_clusters=False, - use_trip_clusters=True): - assert loc_feature in ['cluster', 'coordinates'] - self.loc_feature = loc_feature - self.radius = radius - self.size_thresh = size_thresh - self.purity_thresh = purity_thresh - self.gamma = gamma - self.C = C - self.n_estimators = n_estimators - self.criterion = criterion - self.max_depth = max_depth - self.min_samples_split = min_samples_split - self.min_samples_leaf = min_samples_leaf - self.max_features = max_features - self.bootstrap = bootstrap - self.random_state = random_state - # self.drop_unclustered = drop_unclustered - self.use_start_clusters = use_start_clusters - self.use_trip_clusters = use_trip_clusters - - if self.loc_feature == 'cluster': - # clustering algorithm to generate end clusters - self.end_cluster_model = DBSCANSVMCluster( - loc_type='end', - radius=self.radius, - size_thresh=self.size_thresh, - purity_thresh=self.purity_thresh, - gamma=self.gamma, - C=self.C) - - if self.use_start_clusters or self.use_trip_clusters: - # clustering algorithm to generate start clusters - self.start_cluster_model = DBSCANSVMCluster( - loc_type='start', - radius=self.radius, - size_thresh=self.size_thresh, - purity_thresh=self.purity_thresh, - gamma=self.gamma, - C=self.C) - - if self.use_trip_clusters: - # helper class to generate trip-level clusters - self.trip_grouper = TripGrouper( - start_cluster_col='start_cluster_idx', - end_cluster_col='end_cluster_idx') - - # wrapper class to generate one-hot encodings for cluster indices - self.cluster_enc = OneHotWrapper(sparse=False, - handle_unknown='ignore') - - # wrapper class to generate one-hot encodings for purposes and modes - self.purpose_enc = OneHotWrapper(impute_missing=True, - sparse=False, - handle_unknown='error') - self.mode_enc = OneHotWrapper(impute_missing=True, - sparse=False, - handle_unknown='error') - - # ensemble classifiers for each label category - self.purpose_predictor = RandomForestClassifier( - n_estimators=self.n_estimators, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - max_features=self.max_features, - bootstrap=self.bootstrap, - random_state=self.random_state) - self.mode_predictor = RandomForestClassifier( - n_estimators=self.n_estimators, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - max_features=self.max_features, - bootstrap=self.bootstrap, - random_state=self.random_state) - self.replaced_predictor = RandomForestClassifier( - n_estimators=self.n_estimators, - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - max_features=self.max_features, - bootstrap=self.bootstrap, - random_state=self.random_state) - - def set_params(self, params): - """ hacky code that mimics the set_params of an sklearn Estimator class - so that we can pass params during randomizedsearchCV - - Args: - params (dict): a dictionary where the keys are the parameter - names and the values are the parameter 
values - """ - loc_feature = params['loc_feature'] if 'loc_feature' in params.keys( - ) else self.loc_feature - radius = params['radius'] if 'radius' in params.keys() else self.radius - size_thresh = params['size_thresh'] if 'size_thresh' in params.keys( - ) else self.size_thresh - purity_thresh = params[ - 'purity_thresh'] if 'purity_thresh' in params.keys( - ) else self.purity_thresh - gamma = params['gamma'] if 'gamma' in params.keys() else self.gamma - C = params['C'] if 'C' in params.keys() else self.C - n_estimators = params['n_estimators'] if 'n_estimators' in params.keys( - ) else self.n_estimators - criterion = params['criterion'] if 'criterion' in params.keys( - ) else self.criterion - max_depth = params['max_depth'] if 'max_depth' in params.keys( - ) else self.max_depth - min_samples_split = params[ - 'min_samples_split'] if 'min_samples_split' in params.keys( - ) else self.min_samples_split - min_samples_leaf = params[ - 'min_samples_leaf'] if 'min_samples_leaf' in params.keys( - ) else self.min_samples_leaf - max_features = params['max_features'] if 'max_features' in params.keys( - ) else self.max_features - bootstrap = params['bootstrap'] if 'bootstrap' in params.keys( - ) else self.bootstrap - random_state = params['random_state'] if 'random_state' in params.keys( - ) else self.random_state - use_start_clusters = params[ - 'use_start_clusters'] if 'use_start_clusters' in params.keys( - ) else self.use_start_clusters - # drop_unclustered = params[ - # 'drop_unclustered'] if 'drop_unclustered' in params.keys( - # ) else self.drop_unclustered - use_trip_clusters = params[ - 'use_trip_clusters'] if 'use_trip_clusters' in params.keys( - ) else self.use_trip_clusters - - # yes, calling __init__ again is not good practice... - self.__init__(loc_feature, radius, size_thresh, purity_thresh, gamma, C, - n_estimators, criterion, max_depth, min_samples_split, - min_samples_leaf, max_features, bootstrap, random_state, - use_start_clusters, use_trip_clusters) - return self - - -class ClusterForestSlimPredictor(ForestClassifier): - """ This is the same as ForestClassifier, just with fewer base - features. - - Args: - loc_feature (str): 'coordinates' or 'cluster'; whether to use lat/ - lon coordinates or cluster indices for the location feature - radius (int): radius for DBSCAN clustering. only if - loc_feature=='cluster' - size_thresh (int): the min number of trips a cluster must have to - be considered for sub-division via SVM. only if - loc_feature=='cluster' - purity_thresh (float): the min purity a cluster must have to be - sub-divided via SVM. only if loc_feature=='cluster' - gamma (float): coefficient for the rbf kernel in SVM. only if - loc_feature=='cluster' - C (float): regularization hyperparameter for SVM. only if - loc_feature=='cluster' - n_estimators (int): number of estimators in the random forest - criterion (str): function to measure the quality of a split in the - random forest - max_depth (int): max depth of a tree in the random forest. - unlimited if None. 
- min_samples_split (int): min number of samples required to split an - internal node in a decision tree - min_samples_leaf (int): min number of samples required for a leaf - node in a decision tree - max_features (str): number of features to consider when looking for - the best split in a decision tree - bootstrap (bool): whether bootstrap samples are used when building - decision trees - random_state (int): random state for deterministic random forest - construction - use_start_clusters (bool): whether or not to use start clusters as - input features to the ensemble classifier. only if - loc_feature=='cluster' - use_trip_clusters (bool): whether or not to use trip-level clusters - as input features to the ensemble classifier. only if - loc_feature=='cluster' - """ - - def __init__( - self, - loc_feature='coordinates', - radius=100, # TODO: add different start and end radii - size_thresh=1, - purity_thresh=1.0, - gamma=0.05, - C=1, - n_estimators=100, - criterion='gini', - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - max_features='sqrt', - bootstrap=True, - random_state=42, - # drop_unclustered=False, - use_start_clusters=False, - use_trip_clusters=True): - - super().__init__(loc_feature, radius, size_thresh, purity_thresh, gamma, - C, n_estimators, criterion, max_depth, - min_samples_split, min_samples_leaf, max_features, - bootstrap, random_state, use_start_clusters, - use_trip_clusters) - - self.base_features = [ - 'duration', - 'distance', - ] - - -class AdaBoostClassifier(EnsembleClassifier): - """ AdaBoost-based trip classifier. - - Args: - loc_feature (str): 'coordinates' or 'cluster'; whether to use lat/ - lon coordinates or cluster indices for the location feature - radius (int): radius for DBSCAN clustering. only if - loc_feature=='cluster' - size_thresh (int): the min number of trips a cluster must have to - be considered for sub-division via SVM. only if - loc_feature=='cluster' - purity_thresh (float): the min purity a cluster must have to be - sub-divided via SVM. only if loc_feature=='cluster' - gamma (float): coefficient for the rbf kernel in SVM. only if - loc_feature=='cluster' - C (float): regularization hyperparameter for SVM. only if - loc_feature=='cluster' - n_estimators (int): number of estimators - criterion (str): function to measure the quality of a split in a - decision tree - max_depth (int): max depth of a tree in the random forest. - unlimited if None. - min_samples_split (int): min number of samples required to split an - internal node in a decision tree - min_samples_leaf (int): min number of samples required for a leaf - node in a decision tree - max_features (str): number of features to consider when looking for - the best split in a decision tree - random_state (int): random state for deterministic random forest - construction - use_start_clusters (bool): whether or not to use start clusters as - input features to the ensemble classifier. only if - loc_feature=='cluster' - use_trip_clusters (bool): whether or not to use trip-level clusters - as input features to the ensemble classifier. 
only if - loc_feature=='cluster' - learning_rate (float): weight applied to each decision tree at each - boosting iteration - """ - - def __init__( - self, - loc_feature='coordinates', - radius=100, # TODO: add different start and end radii - size_thresh=1, - purity_thresh=1.0, - gamma=0.05, - C=1, - n_estimators=100, - criterion='gini', - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - max_features='sqrt', - random_state=42, - # drop_unclustered=False, - use_start_clusters=False, - use_trip_clusters=True, - use_base_clusters=True, - learning_rate=1.0): - assert loc_feature in ['cluster', 'coordinates'] - self.loc_feature = loc_feature - self.radius = radius - self.size_thresh = size_thresh - self.purity_thresh = purity_thresh - self.gamma = gamma - self.C = C - self.n_estimators = n_estimators - self.criterion = criterion - self.max_depth = max_depth - self.min_samples_split = min_samples_split - self.min_samples_leaf = min_samples_leaf - self.max_features = max_features - self.random_state = random_state - # self.drop_unclustered = drop_unclustered - self.use_start_clusters = use_start_clusters - self.use_trip_clusters = use_trip_clusters - self.use_base_clusters = use_base_clusters - self.learning_rate = learning_rate - - if self.loc_feature == 'cluster': - # clustering algorithm to generate end clusters - self.end_cluster_model = DBSCANSVMCluster( - loc_type='end', - radius=self.radius, - size_thresh=self.size_thresh, - purity_thresh=self.purity_thresh, - gamma=self.gamma, - C=self.C) - - if self.use_start_clusters or self.use_trip_clusters: - # clustering algorithm to generate start clusters - self.start_cluster_model = DBSCANSVMCluster( - loc_type='start', - radius=self.radius, - size_thresh=self.size_thresh, - purity_thresh=self.purity_thresh, - gamma=self.gamma, - C=self.C) - - if self.use_trip_clusters: - # helper class to generate trip-level clusters - self.trip_grouper = TripGrouper( - start_cluster_col='start_cluster_idx', - end_cluster_col='end_cluster_idx') - - # wrapper class to generate one-hot encodings for cluster indices - self.cluster_enc = OneHotWrapper(sparse=False, - handle_unknown='ignore') - - # wrapper class to generate one-hot encodings for purposes and modes - self.purpose_enc = OneHotWrapper(impute_missing=True, - sparse=False, - handle_unknown='error') - self.mode_enc = OneHotWrapper(impute_missing=True, - sparse=False, - handle_unknown='error') - - self.purpose_predictor = AdaBoostClassifier( - n_estimators=self.n_estimators, - learning_rate=self.learning_rate, - random_state=self.random_state, - base_estimator=DecisionTreeClassifier( - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - max_features=self.max_features, - random_state=self.random_state)) - self.mode_predictor = AdaBoostClassifier( - n_estimators=self.n_estimators, - learning_rate=self.learning_rate, - random_state=self.random_state, - base_estimator=DecisionTreeClassifier( - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - min_samples_leaf=self.min_samples_leaf, - max_features=self.max_features, - random_state=self.random_state)) - self.replaced_predictor = AdaBoostClassifier( - n_estimators=self.n_estimators, - learning_rate=self.learning_rate, - random_state=self.random_state, - base_estimator=DecisionTreeClassifier( - criterion=self.criterion, - max_depth=self.max_depth, - min_samples_split=self.min_samples_split, - 
min_samples_leaf=self.min_samples_leaf, - max_features=self.max_features, - random_state=self.random_state)) - - def set_params(self, params): - """ hacky code that mimics the set_params of an sklearn Estimator class - so that we can pass params during randomizedsearchCV - - Args: - params (dict): a dictionary where the keys are the parameter - names and the values are the parameter values - """ - radius = params['radius'] if 'radius' in params.keys() else self.radius - size_thresh = params['size_thresh'] if 'size_thresh' in params.keys( - ) else self.size_thresh - purity_thresh = params[ - 'purity_thresh'] if 'purity_thresh' in params.keys( - ) else self.purity_thresh - gamma = params['gamma'] if 'gamma' in params.keys() else self.gamma - C = params['C'] if 'C' in params.keys() else self.C - n_estimators = params['n_estimators'] if 'n_estimators' in params.keys( - ) else self.n_estimators - criterion = params['criterion'] if 'criterion' in params.keys( - ) else self.criterion - max_depth = params['max_depth'] if 'max_depth' in params.keys( - ) else self.max_depth - min_samples_split = params[ - 'min_samples_split'] if 'min_samples_split' in params.keys( - ) else self.min_samples_split - min_samples_leaf = params[ - 'min_samples_leaf'] if 'min_samples_leaf' in params.keys( - ) else self.min_samples_leaf - max_features = params['max_features'] if 'max_features' in params.keys( - ) else self.max_features - random_state = params['random_state'] if 'random_state' in params.keys( - ) else self.random_state - use_start_clusters = params[ - 'use_start_clusters'] if 'use_start_clusters' in params.keys( - ) else self.use_start_clusters - # drop_unclustered = params[ - # 'drop_unclustered'] if 'drop_unclustered' in params.keys( - # ) else self.drop_unclustered - use_trip_clusters = params[ - 'use_trip_clusters'] if 'use_trip_clusters' in params.keys( - ) else self.use_trip_clusters - learning_rate = params[ - 'learning_rate'] if 'learning_rate' in params.keys( - ) else self.learning_rate - - # calling __init__ again is not good practice, I know... - self.__init__(radius, size_thresh, purity_thresh, gamma, C, - n_estimators, criterion, max_depth, min_samples_split, - min_samples_leaf, max_features, random_state, - use_start_clusters, use_trip_clusters, learning_rate) - return self - - -class TripGrouper(): - """ Helper class to get trip clusters from start and end clusters. - - Args: - start_cluster_col (str): name of the column containing start - cluster indices - end_cluster_col (str): name of the column containing end cluster - indices - """ - - def __init__(self, - start_cluster_col='start_cluster_idx', - end_cluster_col='end_cluster_idx'): - self.start_cluster_col = start_cluster_col - self.end_cluster_col = end_cluster_col - - def fit_transform(self, trip_df): - """ Fit and remember possible trip clusters. - - Args: - trip_df (DataFrame): DataFrame containing trips. must have - columns and - """ - trip_groups = trip_df.groupby( - [self.start_cluster_col, self.end_cluster_col]) - - # need dict so we can access the trip indices of all the trips in each - # group. the key is the group tuple and the value is the list of trip - # indices in the group. 
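        # (Illustrative example, not from the original file: for a frame with the two
        # cluster columns, df.groupby(['start_cluster_idx', 'end_cluster_idx']).groups
        # is a dict-like mapping from each (start, end) tuple to the row labels in that
        # group, roughly {(0, 0): [0, 2], (0, 1): [1], (1, 1): [3]}.)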
- self.trip_groups_dict = dict(trip_groups.groups) - - # we want to convert trip-group tuples to to trip-cluster indices, - # hence the pd Series - trip_groups_series = pd.Series(list(self.trip_groups_dict.keys())) - - trip_cluster_idx = np.empty(len(trip_df)) - - for group_idx in range(len(trip_groups_series)): - group_tuple = trip_groups_series[group_idx] - trip_idxs_in_group = self.trip_groups_dict[group_tuple] - trip_cluster_idx[trip_idxs_in_group] = group_idx - - return trip_cluster_idx - - def transform(self, new_trip_df): - """ Get trip clusters for a new set of trips. - - Args: - new_trip_df (DataFrame): DataFrame containing trips. must have - columns and - """ - prediction_trip_groups = new_trip_df.groupby( - [self.start_cluster_col, self.end_cluster_col]) - - # need dict so we can access the trip indices of all the trips in each - # group. the key is the group tuple and the value is the list of trip - # indices in the group. - prediction_trip_groups_dict = dict(prediction_trip_groups.groups) - trip_groups_series = pd.Series(list(self.trip_groups_dict.keys())) - trip_cluster_idx = np.empty(len(new_trip_df)) - - for group_tuple in dict(prediction_trip_groups.groups).keys(): - # check if the trip cluster exists in the training set - trip_idxs_in_group = prediction_trip_groups_dict[group_tuple] - if group_tuple in self.trip_groups_dict.keys(): - # look up the group index from the series we created when we - # fit the model - group_idx = trip_groups_series[trip_groups_series == - group_tuple].index[0] - else: - group_idx = -1 - - trip_cluster_idx[trip_idxs_in_group] = group_idx - - return trip_cluster_idx - - -class OneHotWrapper(): - """ Helper class to streamline one-hot encoding. - - Args: - impute_missing (bool): whether or not to impute np.nan values. - sparse (bool): whether or not to return a sparse matrix. - handle_unknown (str): specifies the way unknown categories are - handled during transform. - """ - - def __init__( - self, - impute_missing=False, - sparse=False, - handle_unknown='ignore', - ): - self.impute_missing = impute_missing - if self.impute_missing: - self.encoder = make_pipeline( - SimpleImputer(missing_values=np.nan, - strategy='constant', - fill_value='missing'), - OneHotEncoder(sparse=False, handle_unknown=handle_unknown)) - else: - self.encoder = OneHotEncoder(sparse=sparse, - handle_unknown=handle_unknown) - - def fit_transform(self, train_df, output_col_prefix=None): - """ Establish one-hot encoded variables. - - Args: - train_df (DataFrame): DataFrame containing train trips. 
- output_col_prefix (str): only if train_df is a single column - """ - # TODO: handle pd series - - train_df = train_df.copy() # to avoid SettingWithCopyWarning - - # if imputing, the dtype of each column must be string/object and not - # numerical, otherwise the SimpleImputer will fail - if self.impute_missing: - for col in train_df.columns: - train_df[col] = train_df[col].astype(object) - onehot_encoding = self.encoder.fit_transform(train_df) - self.onehot_encoding_cols_all = [] - for col in train_df.columns: - if train_df.shape[1] > 1 or output_col_prefix is None: - output_col_prefix = col - self.onehot_encoding_cols_all += [ - f'{output_col_prefix}_{val}' - for val in np.sort(train_df[col].dropna().unique()) - ] - # we handle np.nan separately because it is of type float, and may - # cause issues with np.sort if the rest of the unique values are - # strings - if any((train_df[col].isna())): - self.onehot_encoding_cols_all += [f'{output_col_prefix}_nan'] - - onehot_encoding_df = pd.DataFrame( - onehot_encoding, - columns=self.onehot_encoding_cols_all).set_index(train_df.index) - - # ignore the encoded columns for missing entries - self.onehot_encoding_cols = copy.deepcopy(self.onehot_encoding_cols_all) - for col in self.onehot_encoding_cols_all: - if col.endswith('_nan'): - onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) - self.onehot_encoding_cols.remove(col) - - return onehot_encoding_df.astype(int) - - def transform(self, test_df): - """ One-hot encoded features in accordance with features seen in the - train set. - - Args: - test_df (DataFrame): DataFrame of trips. - """ - # TODO: rename test_df, this one doesn't necessarily need to be a df - onehot_encoding = self.encoder.transform(test_df) - onehot_encoding_df = pd.DataFrame( - onehot_encoding, - columns=self.onehot_encoding_cols_all).set_index(test_df.index) - - # ignore the encoded columns for missing entries - for col in self.onehot_encoding_cols_all: - if col.endswith('_nan'): - onehot_encoding_df = onehot_encoding_df.drop(columns=[col]) - - return onehot_encoding_df.astype(int) From 585cc9038220732975894177a9a29672b1c3bffc Mon Sep 17 00:00:00 2001 From: $aTyam Date: Wed, 13 Dec 2023 00:39:43 -0500 Subject: [PATCH 19/28] Update TestForestModel.py Improving the test file by changing the way previpous predictions are stored. 
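For reference, the store-then-compare idea being reworked here looks roughly like the sketch below; the file path and helper name are placeholders rather than the actual test code.

    import json, os

    def check_against_stored(curr_predictions, path='/tmp/prev_predictions.json'):
        # first run: persist the current predictions as the reference copy
        if not os.path.exists(path) or os.path.getsize(path) == 0:
            with open(path, 'w') as f:
                json.dump(curr_predictions, f, indent=4)
            return True
        # later runs: load the stored reference and compare against it
        with open(path) as f:
            prev_predictions = json.load(f)
        return prev_predictions == curr_predictions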
--- .../tests/modellingTests/TestForestModel.py | 52 +++++++++++++------ 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/emission/tests/modellingTests/TestForestModel.py b/emission/tests/modellingTests/TestForestModel.py index 58e96252b..07a52aafe 100644 --- a/emission/tests/modellingTests/TestForestModel.py +++ b/emission/tests/modellingTests/TestForestModel.py @@ -114,20 +114,38 @@ def testRandomForestRegression(self): ## predictions take the form like : # - #{'labels': {'mode_confirm': 'ebike', 'replaced_mode': 'walk', 'purpose_confirm': 'dog-park'}, 'p': 1.0} - # we can store these predictions in a json and then for every run other than the first we - # can load the predictions and compare - - try: - if os.path.exists(file_path) and os.path.getsize(file_path)>0: - with open(file_path, 'r') as f: - prev_predictions_list = json.load(f) - logging.debug() - self.assertEqual(prev_predictions_list,curr_predictions_list," previous predictions should match current predictions") - else: - with open(file_path,'w') as file: - json.dump(curr_predictions_list,file,indent=4) - logging.debug("Previous predicitons stored for future matching" ) - except json.JSONDecodeError: - logging.debug("jsonDecodeErrorError") - return " decoding JSON." \ No newline at end of file + #{'labels': {'mode_confirm': 'ebike', 'replaced_mode': 'walk', 'purpose_confirm': 'dog-park'}, 'p': 1.0} + + #Below are two ways we can store prev. predictions list . Whichever way we finalise, I'll delete the other one. + # + #Method 1 : Run predictions for the first time and hardcode them into + #prev_prdictions_list. For every iteration, simply compare them + # + # for the current data that we read from json, the predictions we get is an empty list. If + # we use a different file with more data, this'll take the for as mentioned above + # + prev_predictions_list= [ + ( + [], + -1 + ) + ] + + self.assertEqual(prev_predictions_list,curr_predictions_list," previous predictions should match current predictions") + + + #Method 2 ( which was failing): Store these predictions into a json and read from + #that json + # + # try: + # if os.path.exists(file_path) and os.path.getsize(file_path)>0: + # with open(file_path, 'r') as f: + # prev_predictions_list = json.load(f) + # logging.debug() + # self.assertEqual(prev_predictions_list,curr_predictions_list," previous predictions should match current predictions") + # else: + # with open(file_path,'w') as file: + # json.dump(curr_predictions_list,file,indent=4) + # logging.debug("Previous predicitons stored for future matching" ) + # except json.JSONDecodeError: + # logging.debug("jsonDecodeErrorError") \ No newline at end of file From 61bbe3f01158e714d392faf8c3920531e528b364 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Sat, 16 Dec 2023 12:56:52 -0500 Subject: [PATCH 20/28] Minor Fixes Fixing circular import --- emission/analysis/modelling/trip_model/models.py | 4 ++-- emission/tests/modellingTests/TestForestModelLoadandSave.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/emission/analysis/modelling/trip_model/models.py b/emission/analysis/modelling/trip_model/models.py index e5fc08b46..1cb6de655 100644 --- a/emission/analysis/modelling/trip_model/models.py +++ b/emission/analysis/modelling/trip_model/models.py @@ -19,7 +19,7 @@ from emission.analysis.modelling.trip_model.clustering import get_distance_matrix, single_cluster_purity import emission.analysis.modelling.trip_model.data_wrangling as eamtd import emission.storage.decorations.trip_queries as esdtq 
-from emission.analysis.classification.inference.labels.inferrers import predict_cluster_confidence_discounting
+import emission.analysis.classification.inference.labels.inferrers as eacili
 import emission.core.wrapper.entry as ecwe
 import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg
 import emission.core.common as ecc
@@ -738,7 +738,7 @@ def predict_proba(self, test_df):
         replaced_distribs = []
 
         for trip in test_trips:
-            trip_prediction = predict_cluster_confidence_discounting(trip)
+            trip_prediction = eacili.predict_cluster_confidence_discounting(trip)
 
             if len(trip_prediction) == 0:
                 # model could not find cluster for the trip
diff --git a/emission/tests/modellingTests/TestForestModelLoadandSave.py b/emission/tests/modellingTests/TestForestModelLoadandSave.py
index dddbb160c..e7d5491b9 100644
--- a/emission/tests/modellingTests/TestForestModelLoadandSave.py
+++ b/emission/tests/modellingTests/TestForestModelLoadandSave.py
@@ -1,7 +1,6 @@
 from typing import ByteString
 import unittest
 import logging
-import pytest
 from unittest.mock import patch
 import emission.analysis.modelling.trip_model.run_model as eamur
 import emission.analysis.modelling.trip_model.model_type as eamumt

From a32ce4f11cd55f7fdb9297fe96196d026459705d Mon Sep 17 00:00:00 2001
From: $aTyam
Date: Tue, 2 Jan 2024 18:53:25 -0500
Subject: [PATCH 21/28] [Tested] Adding Integration test

---
 .../TestForestModelIntegration.py | 55 +++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 emission/tests/modellingTests/TestForestModelIntegration.py

diff --git a/emission/tests/modellingTests/TestForestModelIntegration.py b/emission/tests/modellingTests/TestForestModelIntegration.py
new file mode 100644
index 000000000..508287d04
--- /dev/null
+++ b/emission/tests/modellingTests/TestForestModelIntegration.py
@@ -0,0 +1,55 @@
+# This tests the label inference pipeline.
It uses real data and placeholder inference algorithms +import unittest +import numpy as np +import time +import emission.analysis.classification.inference.labels.pipeline as eacilp +import emission.analysis.classification.inference.labels.inferrers as eacili +import emission.core.wrapper.labelprediction as ecwl +import emission.storage.decorations.analysis_timeseries_queries as esda +import emission.storage.decorations.trip_queries as esdt +import emission.storage.timeseries.timequery as estt +import emission.core.get_database as edb +import emission.tests.common as etc +import logging + +class TestLabelInferencePipeline(unittest.TestCase): + # It is important that these functions be deterministic + + + def setUp(self): + + self.reset_all() + np.random.seed(91) + self.test_algorithms = eacilp.primary_algorithms + etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-07-22") ##maybe use a different file + self.run_pipeline(self.test_algorithms) + time_range = estt.TimeQuery("metadata.write_ts", None, time.time()) + self.inferred_trips = esda.get_entries(esda.INFERRED_TRIP_KEY, self.testUUID, time_query=time_range) + + def tearDown(self): + self.reset_all() + + def run_pipeline(self, algorithms): + default_primary_algorithms = eacilp.primary_algorithms + eacilp.primary_algorithms = algorithms + etc.runIntakePipeline(self.testUUID) + eacilp.primary_algorithms = default_primary_algorithms + + def reset_all(self): + etc.dropAllCollections(edb._get_current_db()) + + # Tests that each of the (test) algorithms runs and saves to the database correctly + def testIndividualAlgorithms(self): + for trip in self.inferred_trips: + entries = esdt.get_sections_for_trip("inference/labels", self.testUUID, trip.get_id()) + self.assertEqual(len(entries), len(self.test_algorithms)) + for entry in entries: + self.assertGreater(len(entry["data"]["prediction"]), 0) + + +def main(): + etc.configLogging() + unittest.main() + +if __name__ == "__main__": + main() From 052cb086d009b7bae7ad95184ba9661672d00e91 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Tue, 9 Jan 2024 21:46:55 -0800 Subject: [PATCH 22/28] Improving test --- .../modellingTests/TestForestModelIntegration.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/emission/tests/modellingTests/TestForestModelIntegration.py b/emission/tests/modellingTests/TestForestModelIntegration.py index 508287d04..90c6fd13b 100644 --- a/emission/tests/modellingTests/TestForestModelIntegration.py +++ b/emission/tests/modellingTests/TestForestModelIntegration.py @@ -10,6 +10,7 @@ import emission.storage.timeseries.timequery as estt import emission.core.get_database as edb import emission.tests.common as etc +import emission.pipeline.intake_stage as epi import logging class TestLabelInferencePipeline(unittest.TestCase): @@ -32,20 +33,22 @@ def tearDown(self): def run_pipeline(self, algorithms): default_primary_algorithms = eacilp.primary_algorithms eacilp.primary_algorithms = algorithms - etc.runIntakePipeline(self.testUUID) + epi.run_intake_pipeline_for_user(self.testUUID,skip_if_no_new_data = False) eacilp.primary_algorithms = default_primary_algorithms def reset_all(self): etc.dropAllCollections(edb._get_current_db()) - # Tests that each of the (test) algorithms runs and saves to the database correctly + # Tests that algorithm being tested runs and saves to the database correctly def testIndividualAlgorithms(self): + logging.debug('TEST1') for trip in self.inferred_trips: entries = 
esdt.get_sections_for_trip("inference/labels", self.testUUID, trip.get_id()) + logging.debug(f"ENTRIES: {entries}") self.assertEqual(len(entries), len(self.test_algorithms)) - for entry in entries: - self.assertGreater(len(entry["data"]["prediction"]), 0) - + # for entry in entries: + # self.assertGreater(len(entry["data"]["prediction"]), 0) + def main(): etc.configLogging() From 104dd9a1a0bcbcfc73719d534ddd058ff6e26e54 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Mon, 5 Feb 2024 01:17:52 -0500 Subject: [PATCH 23/28] Integration Testing for forest model The changes in this iteration are improvements in test for forest model : 1. Post discussion last week, the regression test was removed ( `TestForestModel.py` )since it won't be useful when model performance improves. Rather, the structures of predictions is checked. This check is merged with TestForestModel.py 2. After https://github.com/e-mission/e-mission-server/pull/944 , `predict_labels_with_n` in `run_model.py` expectes a lists and then iterates over it. The forest model and rest of the tests were updated accordingly. --- .../modelling/trip_model/forest_classifier.py | 10 +- .../tests/modellingTests/TestForestModel.py | 151 ------------------ .../TestForestModelIntegration.py | 94 +++++++++-- .../TestForestModelLoadandSave.py | 8 +- .../modellingTests/TestRunForestModel.py | 9 +- 5 files changed, 100 insertions(+), 172 deletions(-) delete mode 100644 emission/tests/modellingTests/TestForestModel.py diff --git a/emission/analysis/modelling/trip_model/forest_classifier.py b/emission/analysis/modelling/trip_model/forest_classifier.py index a8d1dd2de..8e066775d 100644 --- a/emission/analysis/modelling/trip_model/forest_classifier.py +++ b/emission/analysis/modelling/trip_model/forest_classifier.py @@ -103,13 +103,11 @@ def predict(self, trip: List[float]) -> Tuple[List[Dict], int]: #check if theres no trip to predict logging.debug(f"forest classifier predict called with {len(trip)} trips") if len(trip) == 0: - msg = f'model.predict cannot be called with an empty trips' + msg = f'model.predict cannot be called with an empty trip' raise Exception(msg) - # CONVERT LIST OF TRIPS TO dataFrame - test_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",trip) - labeled_trip_df = esdtq.filter_labeled_trips(test_df) - expanded_labeled_trip_df= esdtq.expand_userinputs(labeled_trip_df) - predcitions_df= self.model.predict(expanded_labeled_trip_df) + # CONVERT TRIP TO dataFrame + test_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",[trip]) + predcitions_df= self.model.predict(test_df) # the predictions_df currently holds the highest probable options # individually in all three categories. 
the predictions_df are in the form diff --git a/emission/tests/modellingTests/TestForestModel.py b/emission/tests/modellingTests/TestForestModel.py deleted file mode 100644 index 07a52aafe..000000000 --- a/emission/tests/modellingTests/TestForestModel.py +++ /dev/null @@ -1,151 +0,0 @@ -import unittest -import logging -import numpy as np -import uuid -import json -import os - -import emission.analysis.modelling.trip_model.run_model as eamur -import emission.analysis.modelling.trip_model.model_type as eamumt -import emission.analysis.modelling.trip_model.model_storage as eamums -import emission.storage.json_wrappers as esj -import emission.storage.timeseries.abstract_timeseries as esta -import emission.tests.modellingTests.modellingTestAssets as etmm -import emission.storage.decorations.analysis_timeseries_queries as esda -import emission.core.get_database as edb -import emission.core.wrapper.entry as ecwe -import emission.storage.decorations.analysis_timeseries_queries as esdatq - -class TestForestModel(unittest.TestCase): - - def setUp(self): - """ - sets up the end-to-end run model test with Confirmedtrip data - """ - logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', - level=logging.DEBUG) - - self.user_id = uuid.UUID('aa9fdec9-2944-446c-8ee2-50d79b3044d3') - self.ts = esta.TimeSeries.get_time_series(self.user_id) - self.new_trips_per_invocation = 3 - self.model_type = eamumt.ModelType.RANDOM_FOREST_CLASSIFIER - self.model_storage = eamums.ModelStorage.DOCUMENT_DATABASE - sim_threshold = 500 # meters - self.forest_model_config= { - "loc_feature" : "coordinates", - "radius": 500, - "size_thresh":1, - "purity_thresh":1.0, - "gamma":0.05, - "C":1, - "n_estimators":100, - "criterion":"gini", - "max_depth":'null', - "min_samples_split":2, - "min_samples_leaf":1, - "max_features":"sqrt", - "bootstrap":True, - "random_state":42, - "use_start_clusters":False, - "use_trip_clusters":True - } - - existing_entries_for_user = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) - if len(existing_entries_for_user) != 0: - raise Exception(f"test invariant failed, there should be no entries for user {self.user_id}") - - # load in trips from a test file source - input_file = 'emission/tests/data/real_examples/shankari_2016-06-20.expected_confirmed_trips' - with open(input_file, 'r') as f: - trips_json = json.load(f, object_hook=esj.wrapped_object_hook) - self.trips = [ecwe.Entry(r) for r in trips_json] - logging.debug(f'loaded {len(self.trips)} trips from {input_file}') - - def tearDown(self): - """ - clean up database - """ - edb.get_analysis_timeseries_db().delete_many({'user_id': self.user_id}) - edb.get_model_db().delete_many({'user_id': self.user_id}) - edb.get_pipeline_state_db().delete_many({'user_id': self.user_id}) - - - - def testRandomForestRegression(self): - """ - test to ensure consistent model results. Load data for a user from json, split - into train and test. After training, we generate predictions and match them with - predictions from last time. If the code is run for the first time, the current predicitons - will be stored as ground truth. 
- """ - file_path= 'emission/tests/modellingTests/data.json' - split=int(0.9*len(self.trips)) - train_data= self.trips[:split] - - self.ts.bulk_insert(train_data) - - # confirm write to database succeeded - self.initial_data = list(self.ts.find_entries([esdatq.CONFIRMED_TRIP_KEY])) - if len(self.initial_data) == 0: - logging.debug(f'Writing train data failed') - self.fail() - - test_data=self.trips[split:] - logging.debug(f'LENDATA{len(train_data),len(test_data)}') - eamur.update_trip_model( - user_id=self.user_id, - model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, - model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, - min_trips=4, - model_config=self.forest_model_config - ) - model = eamur._load_stored_trip_model( - user_id=self.user_id, - model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, - model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, - model_config=self.forest_model_config - ) - - curr_predictions_list = eamur.predict_labels_with_n( - trip_list = [test_data], - model=model - ) - - - ## predictions take the form like : - # - #{'labels': {'mode_confirm': 'ebike', 'replaced_mode': 'walk', 'purpose_confirm': 'dog-park'}, 'p': 1.0} - - #Below are two ways we can store prev. predictions list . Whichever way we finalise, I'll delete the other one. - # - #Method 1 : Run predictions for the first time and hardcode them into - #prev_prdictions_list. For every iteration, simply compare them - # - # for the current data that we read from json, the predictions we get is an empty list. If - # we use a different file with more data, this'll take the for as mentioned above - # - prev_predictions_list= [ - ( - [], - -1 - ) - ] - - self.assertEqual(prev_predictions_list,curr_predictions_list," previous predictions should match current predictions") - - - #Method 2 ( which was failing): Store these predictions into a json and read from - #that json - # - # try: - # if os.path.exists(file_path) and os.path.getsize(file_path)>0: - # with open(file_path, 'r') as f: - # prev_predictions_list = json.load(f) - # logging.debug() - # self.assertEqual(prev_predictions_list,curr_predictions_list," previous predictions should match current predictions") - # else: - # with open(file_path,'w') as file: - # json.dump(curr_predictions_list,file,indent=4) - # logging.debug("Previous predicitons stored for future matching" ) - # except json.JSONDecodeError: - # logging.debug("jsonDecodeErrorError") \ No newline at end of file diff --git a/emission/tests/modellingTests/TestForestModelIntegration.py b/emission/tests/modellingTests/TestForestModelIntegration.py index 90c6fd13b..89d0a639d 100644 --- a/emission/tests/modellingTests/TestForestModelIntegration.py +++ b/emission/tests/modellingTests/TestForestModelIntegration.py @@ -13,16 +13,87 @@ import emission.pipeline.intake_stage as epi import logging -class TestLabelInferencePipeline(unittest.TestCase): - # It is important that these functions be deterministic - +import emission.analysis.modelling.trip_model.run_model as eamur +import emission.analysis.modelling.trip_model.model_type as eamumt +import emission.analysis.modelling.trip_model.model_storage as eamums +import emission.tests.modellingTests.modellingTestAssets as etmm +import emission.storage.timeseries.abstract_timeseries as esta + + +class TestForestModelIntegration(unittest.TestCase): + # Test if the forest model for label prediction is smoothly integrated with the inference pipeline. + # In the initial setup, build a dummy forest model. Then run the pipeline on real example data. 
+ # Finally in the test, assert the type of label predictions expected. def setUp(self): self.reset_all() np.random.seed(91) self.test_algorithms = eacilp.primary_algorithms + + forest_model_config= { + "loc_feature" : "coordinates", + "radius": 500, + "size_thresh":1, + "purity_thresh":1.0, + "gamma":0.05, + "C":1, + "n_estimators":100, + "criterion":"gini", + "max_depth":'null', + "min_samples_split":2, + "min_samples_leaf":1, + "max_features":"sqrt", + "bootstrap":True, + "random_state":42, + "use_start_clusters":False, + "use_trip_clusters":True + } etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-07-22") ##maybe use a different file + ts = esta.TimeSeries.get_time_series(self.testUUID) + label_data = { + "mode_confirm": ['ebike', 'bike'], + "purpose_confirm": ['happy-hour', 'dog-park'], + "replaced_mode": ['walk'], + "mode_weights": [0.9, 0.1], + "purpose_weights": [0.1, 0.9] + } + + self.origin = (-105.1705977, 39.7402654,) + self.destination = (-105.1755606, 39.7673075) + self.min_trips = 14 + self.total_trips = 100 + self.clustered_trips = 33 + self.has_label_percent = 0.9 + ## generate mock trips + train = etmm.generate_mock_trips( + user_id=self.testUUID, + trips=self.total_trips, + origin=self.origin, + destination=self.destination, + trip_part='od', + label_data=label_data, + within_threshold=self.clustered_trips, + threshold=0.004, # ~400m + has_label_p=self.has_label_percent + ) + ts.bulk_insert(train) + # confirm data write did not fail + check_data = esda.get_entries(key="analysis/confirmed_trip", user_id=self.testUUID, time_query=None) + if len(check_data) != self.total_trips: + logging.debug(f'test invariant failed after generating test data') + self.fail() + else: + logging.debug(f'found {self.total_trips} trips in database') + ## Build an already existing model or a new model + eamur.update_trip_model( + user_id=self.testUUID, + model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, + model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, + min_trips=4, + model_config=forest_model_config + ) + ## run inference pipeline self.run_pipeline(self.test_algorithms) time_range = estt.TimeQuery("metadata.write_ts", None, time.time()) self.inferred_trips = esda.get_entries(esda.INFERRED_TRIP_KEY, self.testUUID, time_query=time_range) @@ -39,16 +110,19 @@ def run_pipeline(self, algorithms): def reset_all(self): etc.dropAllCollections(edb._get_current_db()) - # Tests that algorithm being tested runs and saves to the database correctly - def testIndividualAlgorithms(self): - logging.debug('TEST1') + # Tests that forest algorithm being tested runs successfully + def testForestAlgorithm(self): for trip in self.inferred_trips: entries = esdt.get_sections_for_trip("inference/labels", self.testUUID, trip.get_id()) - logging.debug(f"ENTRIES: {entries}") self.assertEqual(len(entries), len(self.test_algorithms)) - # for entry in entries: - # self.assertGreater(len(entry["data"]["prediction"]), 0) - + for entry in entries: + self.assertGreater(len(entry["data"]["prediction"]), 0) + for singleprediction in entry["data"]["prediction"]: + self.assertIsInstance(singleprediction, dict, " should be an instance of the dictionary class") + self.assertIsInstance(singleprediction['labels'], dict, " should be an instance of the dictionary class") + self.assertIn('mode_confirm',singleprediction['labels'].keys()) + self.assertIn('replaced_mode',singleprediction['labels'].keys()) + self.assertIn('purpose_confirm',singleprediction['labels'].keys()) def main(): etc.configLogging() 
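For context, the structure asserted on above is the list-of-dicts prediction format quoted earlier in these tests; a standalone sketch of that check, with sample values only:

    prediction = [{'labels': {'mode_confirm': 'ebike',
                              'replaced_mode': 'walk',
                              'purpose_confirm': 'dog-park'},
                   'p': 1.0}]

    for single in prediction:
        assert isinstance(single, dict)
        assert isinstance(single['labels'], dict)
        assert {'mode_confirm', 'replaced_mode', 'purpose_confirm'} <= single['labels'].keys()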
diff --git a/emission/tests/modellingTests/TestForestModelLoadandSave.py b/emission/tests/modellingTests/TestForestModelLoadandSave.py index e7d5491b9..8da1fce5b 100644 --- a/emission/tests/modellingTests/TestForestModelLoadandSave.py +++ b/emission/tests/modellingTests/TestForestModelLoadandSave.py @@ -134,7 +134,7 @@ def testForestModelRoundTrip(self): # logging.debug(f'Predictions on trips in database') predictions_list = eamur.predict_labels_with_n( - trip_list = [test], + trip_list = test, model=model ) @@ -151,7 +151,7 @@ def testForestModelRoundTrip(self): # logging.debug(f'Predictions on trips using deserialised model') predictions_loaded_model_list = eamur.predict_labels_with_n( - trip_list = [test], + trip_list = test, model=deserialized_model ) # logging.debug(f'Assert that both predictions are the same') @@ -184,7 +184,7 @@ def testForestModelConsistency(self): # logging.debug(f' Model Predictions on trips in database') predictions_list_model1 = eamur.predict_labels_with_n( - trip_list = [test], + trip_list = test, model=model_iter1 ) # logging.debug(f' Loading Model again') @@ -197,7 +197,7 @@ def testForestModelConsistency(self): ) # logging.debug(f' Model Predictions on trips in database') predictions_list_model2 = eamur.predict_labels_with_n( - trip_list = [test], + trip_list = test, model=model_iter2 ) diff --git a/emission/tests/modellingTests/TestRunForestModel.py b/emission/tests/modellingTests/TestRunForestModel.py index 2ca48c4f4..1676e878d 100644 --- a/emission/tests/modellingTests/TestRunForestModel.py +++ b/emission/tests/modellingTests/TestRunForestModel.py @@ -183,9 +183,16 @@ def test1RoundPredictForestModel(self): ) predictions_list = eamur.predict_labels_with_n( - trip_list = [test], + trip_list = test, model=model ) for prediction, n in predictions_list: [logging.debug(p) for p in sorted(prediction, key=lambda r: r['p'], reverse=True)] self.assertNotEqual(len(prediction), 0, "should have a prediction") + self.assertIn('labels',prediction[0].keys()) + self.assertIn('p',prediction[0].keys()) + self.assertIsInstance(prediction[0], dict, " should be an instance of the dictionary class") + self.assertIsInstance(prediction[0]['labels'], dict, " should be an instance of the dictionary class") + self.assertIn('mode_confirm',prediction[0]['labels'].keys()) + self.assertIn('replaced_mode',prediction[0]['labels'].keys()) + self.assertIn('purpose_confirm',prediction[0]['labels'].keys()) \ No newline at end of file From 1b523edf7ebd18ec9c8b62cb333e13ff4045c31b Mon Sep 17 00:00:00 2001 From: $aTyam Date: Fri, 15 Mar 2024 13:42:09 -0400 Subject: [PATCH 24/28] [Tested] Improvements for model integration 1. Improved tests in `TestForestModelLoadandSave.py` 2. 
Better comments, imports nd cleanup --- .../modelling/trip_model/dbscan_svm.py | 250 ------------------ .../modelling/trip_model/forest_classifier.py | 91 ++++--- .../analysis/modelling/trip_model/models.py | 66 ++--- .../modelling/trip_model/run_model.py | 1 - .../analysis/modelling/trip_model/util.py | 2 - .../TestForestModelIntegration.py | 44 +-- .../TestForestModelLoadandSave.py | 96 +++---- .../modellingTests/TestRunForestModel.py | 24 +- 8 files changed, 163 insertions(+), 411 deletions(-) delete mode 100644 emission/analysis/modelling/trip_model/dbscan_svm.py diff --git a/emission/analysis/modelling/trip_model/dbscan_svm.py b/emission/analysis/modelling/trip_model/dbscan_svm.py deleted file mode 100644 index 58cd8f7e0..000000000 --- a/emission/analysis/modelling/trip_model/dbscan_svm.py +++ /dev/null @@ -1,250 +0,0 @@ -import emission.analysis.modelling.trip_model.trip_model as eamuu -from sklearn.cluster import DBSCAN -import logging -import numpy as np -import pandas as pd -import emission.analysis.modelling.trip_model.util as eamtu -from sklearn.preprocessing import StandardScaler -from sklearn.pipeline import make_pipeline -from sklearn import svm -from sklearn.metrics.pairwise import haversine_distances - -EARTH_RADIUS = 6371000 - -class DBSCANSVMCluster(eamuu.TripModel): - """ DBSCAN-based clustering algorithm that optionally implements SVM - sub-clustering. - - Args: - loc_type (str): 'start' or 'end', the type of point to cluster - radius (int): max distance between two points in each other's - neighborhood, i.e. DBSCAN's eps value. does not strictly - dictate final cluster size - size_thresh (int): the min number of trips a cluster must have - to be considered for SVM sub-division - purity_thresh (float): the min purity a cluster must have - to be sub-divided using SVM - gamma (float): coefficient for the rbf kernel in SVM - C (float): regularization hyperparameter for SVM - - Attributes: - loc_type (str) - radius (int) - size_thresh (int) - purity_thresh (float) - gamma (float) - C (float) - train_df (DataFrame) - test_df (DataFrame) - base_model (sklearn Estimator) - """ - - def __init__(self, - loc_type='end', - radius=100, - svm=True, - size_thresh=1, - purity_thresh=1.0, - gamma=0.05, - C=1): - logging.info("PERF: Initializing DBSCANSVMCluster") - self.loc_type = loc_type - self.radius = radius - self.svm = svm - self.size_thresh = size_thresh - self.purity_thresh = purity_thresh - self.gamma = gamma - self.C = C - - def set_params(self, params): - if 'loc_type' in params.keys(): self.loc_type = params['loc_type'] - if 'radius' in params.keys(): self.radius = params['radius'] - if 'svm' in params.keys(): self.svm = params['svm'] - if 'size_thresh' in params.keys(): - self.size_thresh = params['size_thresh'] - if 'purity_thresh' in params.keys(): - self.purity_thresh = params['purity_thresh'] - if 'gamma' in params.keys(): self.gamma = params['gamma'] - - return self - - def fit(self, train_df,ct_entry=None): - """ Creates clusters of trip points. - self.train_df will be updated with columns containing base and - final clusters. - - TODO: perhaps move the loc_type argument to fit() so we can use a - single class instance to cluster both start and end points. This - will also help us reduce duplicate data. 
- - Args: - train_df (dataframe): dataframe of labeled trips - ct_entry (List) : A list of Entry type of labeled and unlabeled trips - """ - ################## - ### clean data ### - ################## - logging.info("PERF: Fitting DBSCANSVMCluster") - self.train_df = self._clean_data(train_df) - - # we can use all trips as long as they have purpose labels. it's ok if - # they're missing mode/replaced-mode labels, because they aren't as - # strongly correlated with location compared to purpose - # TODO: actually, we may want to rethink this. for example, it will - # probably be helpful to include trips that are missing purpose labels - # but still have mode labels. - if self.train_df.purpose_true.isna().any(): - num_nan = self.train_df.purpose_true.value_counts( - dropna=False).loc[np.nan] - logging.info( - f'dropping {num_nan}/{len(self.train_df)} trips that are missing purpose labels' - ) - self.train_df = self.train_df.dropna( - subset=['purpose_true']).reset_index(drop=True) - if len(self.train_df) == 0: - # i.e. no valid trips after removing all nans - raise Exception('no valid trips; nothing to fit') - - ######################### - ### get base clusters ### - ######################### - dist_matrix_meters = eamtu.get_distance_matrix(self.train_df, self.loc_type) - self.base_model = DBSCAN(self.radius, - metric="precomputed", - min_samples=1).fit(dist_matrix_meters) - base_clusters = self.base_model.labels_ - - self.train_df.loc[:, - f'{self.loc_type}_base_cluster_idx'] = base_clusters - - ######################## - ### get sub-clusters ### - ######################## - # copy base cluster column into final cluster column - self.train_df.loc[:, f'{self.loc_type}_cluster_idx'] = self.train_df[ - f'{self.loc_type}_base_cluster_idx'] - - if self.svm: - c = 0 # count of how many clusters we have iterated over - - # iterate over all clusters and subdivide them with SVM. the while - # loop is so we can do multiple iterations of subdividing if needed - while c < self.train_df[f'{self.loc_type}_cluster_idx'].max(): - points_in_cluster = self.train_df[ - self.train_df[f'{self.loc_type}_cluster_idx'] == c] - - # only do SVM if we have the minimum num of trips in the cluster - if len(points_in_cluster) < self.size_thresh: - c += 1 - continue - - # only do SVM if purity is below threshold - purity = eamtu.single_cluster_purity(points_in_cluster, - label_col='purpose_true') - if purity < self.purity_thresh: - X = points_in_cluster[[ - f"{self.loc_type}_lon", f"{self.loc_type}_lat" - ]] - y = points_in_cluster.purpose_true.to_list() - - svm_model = make_pipeline( - StandardScaler(), - svm.SVC( - kernel='rbf', - gamma=self.gamma, - C=self.C, - )).fit(X, y) - labels = svm_model.predict(X) - unique_labels = np.unique(labels) - - # if the SVM predicts that all points in the cluster have - # the same label, just ignore it and don't reindex. - # this also helps us to handle the possibility that a - # cluster may be impure but inherently inseparable, e.g. an - # end cluster at a user's home, containing 50% trips from - # work to home and 50% round trips that start and end at - # home. 
we don't want to reindex otherwise the low purity - # will trigger SVM again, and we will attempt & fail to - # split the cluster ad infinitum - if len(unique_labels) > 1: - # map purpose labels to new cluster indices - # we offset indices by the max existing index so that we - # don't run into any duplicate indices - max_existing_idx = self.train_df[ - f'{self.loc_type}_cluster_idx'].max() - label_to_cluster = { - unique_labels[i]: i + max_existing_idx + 1 - for i in range(len(unique_labels)) - } - # update trips with their new cluster indices - indices = np.array( - [label_to_cluster[l] for l in labels]) - self.train_df.loc[ - self.train_df[f'{self.loc_type}_cluster_idx'] == c, - f'{self.loc_type}_cluster_idx'] = indices - - c += 1 - # TODO: make things categorical at the end? or maybe at the start of the decision tree pipeline - - return self - - def fit_predict(self, train_df): - """ Override to avoid unnecessarily computation of distance matrices. - """ - self.fit(train_df) - return self.train_df[[f'{self.loc_type}_cluster_idx']] - - def predict(self, test_df): - logging.info("PERF: Predicting DBSCANSVMCluster") - # TODO: store clusters as polygons so the prediction is faster - # TODO: we probably don't want to store test_df in self to be more memory-efficient - self.test_df = self._clean_data(test_df) - pred_clusters = self._NN_predict(self.test_df) - - self.test_df.loc[:, f'{self.loc_type}_cluster_idx'] = pred_clusters - - return self.test_df[[f'{self.loc_type}_cluster_idx']] - - def _NN_predict(self, test_df): - """ Generate base-cluster predictions for the test data using a - nearest-neighbor approach. - - sklearn doesn't implement predict() for DBSCAN, which is why we - need a custom method. - """ - logging.info("PERF: NN_predicting DBSCANSVMCluster") - n_samples = test_df.shape[0] - labels = np.ones(shape=n_samples, dtype=int) * -1 - - # get coordinates of core points (we can't use model.components_ - # because our input feature was a distance matrix and doesn't contain - # info about the raw coordinates) - # NOTE: technically, every single point in a cluster is a core point - # because it has at least minPts (2) points, including itself, in its - # radius - train_coordinates = self.train_df[[ - f'{self.loc_type}_lat', f'{self.loc_type}_lon' - ]] - train_radians = np.radians(train_coordinates) - - for idx, row in test_df.reset_index(drop=True).iterrows(): - # calculate the distances between the ith test data and all points, - # then find the index of the closest point. if the ith test data is - # within epsilon of the point, then assign its cluster to the ith - # test data (otherwise, leave it as -1, indicating noise). 
- # unfortunately, pairwise_distances_argmin() does not support - # haversine distance, so we have to reimplement it ourselves - new_loc_radians = np.radians( - row[[self.loc_type + "_lat", self.loc_type + "_lon"]].to_list()) - new_loc_radians = np.reshape(new_loc_radians, (1, 2)) - dist_matrix_meters = haversine_distances( - new_loc_radians, train_radians) * EARTH_RADIUS - - shortest_dist_idx = np.argmin(dist_matrix_meters) - if dist_matrix_meters[0, shortest_dist_idx] < self.radius: - labels[idx] = self.train_df.reset_index( - drop=True).loc[shortest_dist_idx, - f'{self.loc_type}_cluster_idx'] - - return labels - diff --git a/emission/analysis/modelling/trip_model/forest_classifier.py b/emission/analysis/modelling/trip_model/forest_classifier.py index 8e066775d..16eee014f 100644 --- a/emission/analysis/modelling/trip_model/forest_classifier.py +++ b/emission/analysis/modelling/trip_model/forest_classifier.py @@ -1,17 +1,16 @@ -import pandas as pd -from sklearn.preprocessing import OneHotEncoder import joblib from typing import Dict, List, Optional, Tuple -from sklearn.metrics.pairwise import haversine_distances +import sklearn.metrics.pairwise as smp import emission.core.wrapper.confirmedtrip as ecwc import logging from io import BytesIO +import json import emission.analysis.modelling.trip_model.trip_model as eamuu import emission.analysis.modelling.trip_model.config as eamtc import emission.storage.timeseries.builtin_timeseries as estb import emission.storage.decorations.trip_queries as esdtq -from emission.analysis.modelling.trip_model.models import ForestClassifier +import emission.analysis.modelling.trip_model.models as eamtm EARTH_RADIUS = 6371000 @@ -33,45 +32,33 @@ def __init__(self,config=None): 'min_samples_leaf', 'max_features', 'bootstrap', - ] - cluster_expected_keys= [ - 'radius', - 'size_thresh', - 'purity_thresh', - 'gamma', - 'C', - 'use_start_clusters', - 'use_trip_clusters', - ] - + ] + ######### Not Tested ######### + # The below code is used when we cluster the coordinates (loc_cluster parameter = True) + # before passing to Random Forest. Commenting this for now since it is not used. Not tested either. 
+ ############################### + + # cluster_expected_keys= [ + # 'radius', + # 'size_thresh', + # 'purity_thresh', + # 'gamma', + # 'C', + # 'use_start_clusters', + # 'use_trip_clusters', + # ] + # + # if config['loc_feature'] == 'cluster': + # for k in cluster_expected_keys: + # if config.get(k) is None: + # msg = f"cluster trip model config missing expected key {k}" + # raise KeyError(msg) + ####################################### for k in random_forest_expected_keys: if config.get(k) is None: msg = f"forest trip model config missing expected key {k}" raise KeyError(msg) - - if config['loc_feature'] == 'cluster': - for k in cluster_expected_keys: - if config.get(k) is None: - msg = f"cluster trip model config missing expected key {k}" - raise KeyError(msg) - maxdepth =config['max_depth'] if config['max_depth']!='null' else None - self.model=ForestClassifier( loc_feature=config['loc_feature'], - radius= config['radius'], - size_thresh=config['radius'], - purity_thresh=config['purity_thresh'], - gamma=config['gamma'], - C=config['C'], - n_estimators=config['n_estimators'], - criterion=config['criterion'], - max_depth=maxdepth, - min_samples_split=config['min_samples_split'], - min_samples_leaf=config['min_samples_leaf'], - max_features=config['max_features'], - bootstrap=config['bootstrap'], - random_state=config['random_state'], - # drop_unclustered=False, - use_start_clusters=config['use_start_clusters'], - use_trip_clusters=config['use_trip_clusters']) + self.model=eamtm.ForestClassifier(**config) def fit(self,trips: List[ecwc.Confirmedtrip]): @@ -139,9 +126,15 @@ def to_dict(self): """ data={} attr=[ 'purpose_predictor','mode_predictor','replaced_predictor','purpose_enc','mode_enc','train_df'] - if self.model.loc_feature == 'cluster': - ## confirm this includes all the extra encoders/models - attr.extend([ 'cluster_enc','end_cluster_model','start_cluster_model','trip_grouper']) + + ######### Not Tested ######### + # The below code is used when we cluster the coordinates (loc_cluster parameter = True) + # before passing to Random Forest. Commenting this for now since it is not used. Not tested either. + ############################### + # if self.model.loc_feature == 'cluster': + # ## confirm this includes all the extra encoders/models + # attr.extend([ 'cluster_enc','end_cluster_model','start_cluster_model','trip_grouper']) + for attribute_name in attr: if not hasattr(self.model,attribute_name): raise ValueError(f"Attribute {attribute_name} not found in the model") @@ -153,7 +146,7 @@ def to_dict(self): raise RuntimeError(f"Error serializing { attribute_name}: {str(e)}") buffer.seek(0) data[attribute_name]=buffer.getvalue() - + return data def from_dict(self,model: Dict): @@ -161,9 +154,14 @@ def from_dict(self,model: Dict): Load the model from a dictionary. """ attr=[ 'purpose_predictor','mode_predictor','replaced_predictor','purpose_enc','mode_enc','train_df'] - if self.model.loc_feature == 'cluster': - ## TODO : confirm this includes all the extra encoders/models - attr.extend([ 'cluster_enc','end_cluster_model','start_cluster_model','trip_grouper']) + + ######### Not Tested ######### + # The below code is used when we cluster the coordinates (loc_cluster parameter = True) + # before passing to Random Forest. Commenting this for now since it is not used. Not tested either. 
+ ############################### + # if self.model.loc_feature == 'cluster': + # ## TODO : confirm this includes all the extra encoders/models + # attr.extend([ 'cluster_enc','end_cluster_model','start_cluster_model','trip_grouper']) for attribute_name in attr: if attribute_name not in model: raise ValueError(f"Attribute {attribute_name} missing in the model") @@ -183,6 +181,7 @@ def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: :return: a vector containing features to predict from :rtype: List[float] """ + # ForestClassifier class in models.py file handles features extraction. pass def is_incremental(self) -> bool: diff --git a/emission/analysis/modelling/trip_model/models.py b/emission/analysis/modelling/trip_model/models.py index 1cb6de655..cc3b58a2e 100644 --- a/emission/analysis/modelling/trip_model/models.py +++ b/emission/analysis/modelling/trip_model/models.py @@ -1514,7 +1514,7 @@ def __init__( self.C = C self.n_estimators = n_estimators self.criterion = criterion - self.max_depth = max_depth + self.max_depth = max_depth if max_depth!= 'null' else None self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.max_features = max_features @@ -1524,36 +1524,42 @@ def __init__( self.use_start_clusters = use_start_clusters self.use_trip_clusters = use_trip_clusters - if self.loc_feature == 'cluster': - # clustering algorithm to generate end clusters - self.end_cluster_model = DBSCANSVMCluster( - loc_type='end', - radius=self.radius, - size_thresh=self.size_thresh, - purity_thresh=self.purity_thresh, - gamma=self.gamma, - C=self.C) - - if self.use_start_clusters or self.use_trip_clusters: - # clustering algorithm to generate start clusters - self.start_cluster_model = DBSCANSVMCluster( - loc_type='start', - radius=self.radius, - size_thresh=self.size_thresh, - purity_thresh=self.purity_thresh, - gamma=self.gamma, - C=self.C) - - if self.use_trip_clusters: - # helper class to generate trip-level clusters - self.trip_grouper = TripGrouper( - start_cluster_col='start_cluster_idx', - end_cluster_col='end_cluster_idx') - - # wrapper class to generate one-hot encodings for cluster indices - self.cluster_enc = OneHotWrapper(sparse=False, - handle_unknown='ignore') + ######### Not Tested ######### + # The below code is used when we cluster the coordinates (loc_cluster parameter = True) + # before passing to Random Forest. Commenting this for now since it is not tested. 
+ ############################### + # if self.loc_feature == 'cluster': + # # clustering algorithm to generate end clusters + # self.end_cluster_model = DBSCANSVMCluster( + # loc_type='end', + # radius=self.radius, + # size_thresh=self.size_thresh, + # purity_thresh=self.purity_thresh, + # gamma=self.gamma, + # C=self.C) + + # if self.use_start_clusters or self.use_trip_clusters: + # # clustering algorithm to generate start clusters + # self.start_cluster_model = DBSCANSVMCluster( + # loc_type='start', + # radius=self.radius, + # size_thresh=self.size_thresh, + # purity_thresh=self.purity_thresh, + # gamma=self.gamma, + # C=self.C) + + # if self.use_trip_clusters: + # # helper class to generate trip-level clusters + # self.trip_grouper = TripGrouper( + # start_cluster_col='start_cluster_idx', + # end_cluster_col='end_cluster_idx') + + # # wrapper class to generate one-hot encodings for cluster indices + # self.cluster_enc = OneHotWrapper(sparse=False, + # handle_unknown='ignore') + ############################################################################# + # wrapper class to generate one-hot encodings for purposes and modes self.purpose_enc = OneHotWrapper(impute_missing=True, sparse=False, diff --git a/emission/analysis/modelling/trip_model/run_model.py b/emission/analysis/modelling/trip_model/run_model.py index f27457c60..cfee60464 100644 --- a/emission/analysis/modelling/trip_model/run_model.py +++ b/emission/analysis/modelling/trip_model/run_model.py @@ -56,7 +56,6 @@ def update_trip_model( time_query = time_query_from_pipeline if model.is_incremental else None logging.debug(f'model type {model_type.name} is incremental? {model.is_incremental}') logging.debug(f'time query for training data collection: {time_query}') - trips = _get_training_data(user_id, time_query) # don't start training for a user that doesn't have at least $trips many trips # (assume if a stored model exists for the user, that they met this requirement previously) diff --git a/emission/analysis/modelling/trip_model/util.py b/emission/analysis/modelling/trip_model/util.py index b3a9a012c..b3da1d4a1 100644 --- a/emission/analysis/modelling/trip_model/util.py +++ b/emission/analysis/modelling/trip_model/util.py @@ -7,10 +7,8 @@ def find_knee_point(values: List[float]) -> Tuple[float, int]: """for a list of values, find the value which represents the cut-off point or "elbow" in the function when values are sorted. - copied from original similarity algorithm. 
permalink: [https://github.com/e-mission/e-mission-server/blob/5b9e608154de15e32df4f70a07a5b95477e7dbf5/emission/analysis/modelling/tour_model/similarity.py#L256] - with `y` passed in as `values` based on this stack overflow answer: https://stackoverflow.com/a/2022348/4803266 And summarized by the statement: "A quick way of finding the elbow is to draw a diff --git a/emission/tests/modellingTests/TestForestModelIntegration.py b/emission/tests/modellingTests/TestForestModelIntegration.py index 89d0a639d..fad84662b 100644 --- a/emission/tests/modellingTests/TestForestModelIntegration.py +++ b/emission/tests/modellingTests/TestForestModelIntegration.py @@ -12,6 +12,7 @@ import emission.tests.common as etc import emission.pipeline.intake_stage as epi import logging +import emission.analysis.modelling.trip_model.config as eamtc import emission.analysis.modelling.trip_model.run_model as eamur import emission.analysis.modelling.trip_model.model_type as eamumt @@ -26,29 +27,10 @@ class TestForestModelIntegration(unittest.TestCase): # Finally in the test, assert the type of label predictions expected. def setUp(self): - - self.reset_all() np.random.seed(91) self.test_algorithms = eacilp.primary_algorithms - - forest_model_config= { - "loc_feature" : "coordinates", - "radius": 500, - "size_thresh":1, - "purity_thresh":1.0, - "gamma":0.05, - "C":1, - "n_estimators":100, - "criterion":"gini", - "max_depth":'null', - "min_samples_split":2, - "min_samples_leaf":1, - "max_features":"sqrt", - "bootstrap":True, - "random_state":42, - "use_start_clusters":False, - "use_trip_clusters":True - } + forest_model_config = eamtc.get_config_value_or_raise('model_parameters.forest') + etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-07-22") ##maybe use a different file ts = esta.TimeSeries.get_time_series(self.testUUID) label_data = { @@ -59,23 +41,18 @@ def setUp(self): "purpose_weights": [0.1, 0.9] } - self.origin = (-105.1705977, 39.7402654,) - self.destination = (-105.1755606, 39.7673075) - self.min_trips = 14 - self.total_trips = 100 - self.clustered_trips = 33 - self.has_label_percent = 0.9 + self.total_trips=100 ## generate mock trips train = etmm.generate_mock_trips( user_id=self.testUUID, trips=self.total_trips, - origin=self.origin, - destination=self.destination, + origin=(-105.1705977, 39.7402654), + destination=(-105.1755606, 39.7673075), trip_part='od', label_data=label_data, - within_threshold=self.clustered_trips, + within_threshold= 33, threshold=0.004, # ~400m - has_label_p=self.has_label_percent + has_label_p=0.9 ) ts.bulk_insert(train) # confirm data write did not fail @@ -108,7 +85,10 @@ def run_pipeline(self, algorithms): eacilp.primary_algorithms = default_primary_algorithms def reset_all(self): - etc.dropAllCollections(edb._get_current_db()) + edb.get_analysis_timeseries_db().delete_many({'user_id': self.testUUID}) + edb.get_model_db().delete_many({'user_id': self.testUUID}) + edb.get_pipeline_state_db().delete_many({'user_id': self.testUUID}) + # Tests that forest algorithm being tested runs successfully def testForestAlgorithm(self): diff --git a/emission/tests/modellingTests/TestForestModelLoadandSave.py b/emission/tests/modellingTests/TestForestModelLoadandSave.py index 8da1fce5b..37768a689 100644 --- a/emission/tests/modellingTests/TestForestModelLoadandSave.py +++ b/emission/tests/modellingTests/TestForestModelLoadandSave.py @@ -5,14 +5,13 @@ import emission.analysis.modelling.trip_model.run_model as eamur import 
emission.analysis.modelling.trip_model.model_type as eamumt import emission.analysis.modelling.trip_model.model_storage as eamums - +import emission.analysis.modelling.trip_model.config as eamtc +import uuid import emission.storage.timeseries.abstract_timeseries as esta import emission.tests.modellingTests.modellingTestAssets as etmm import emission.storage.decorations.analysis_timeseries_queries as esda import emission.core.get_database as edb -import emission.storage.pipeline_queries as epq -import emission.core.wrapper.pipelinestate as ecwp - +import emission.analysis.modelling.trip_model.run_model as eamtr class TestForestModelLoadandSave(unittest.TestCase): """ @@ -40,7 +39,7 @@ def setUp(self): # for a negative test, below self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl' - # test data can be saved between test invocations, check if data exists before generating + # Ensuring that no previous test data was left in DB after teardown, ts = esta.TimeSeries.get_time_series(user_id) test_data = list(ts.find_entries(["analysis/confirmed_trip"])) if len(test_data) == 0: @@ -56,7 +55,7 @@ def setUp(self): "purpose_weights": [0.1, 0.9] } - train = etmm.generate_mock_trips( + test_data = etmm.generate_mock_trips( user_id=user_id, trips=self.total_trips, origin=self.origin, @@ -68,7 +67,7 @@ def setUp(self): has_label_p=self.has_label_percent ) - ts.bulk_insert(train) + ts.bulk_insert(test_data) # confirm data write did not fail test_data = esda.get_entries(key="analysis/confirmed_trip", user_id=user_id, time_query=None) @@ -78,24 +77,7 @@ def setUp(self): else: logging.debug(f'found {self.total_trips} trips in database') - self.forest_model_config= { - "loc_feature" : "coordinates", - "radius": 500, - "size_thresh":1, - "purity_thresh":1.0, - "gamma":0.05, - "C":1, - "n_estimators":100, - "criterion":"gini", - "max_depth":'null', - "min_samples_split":2, - "min_samples_leaf":1, - "max_features":"sqrt", - "bootstrap":True, - "random_state":42, - "use_start_clusters":False, - "use_trip_clusters":True - } + self.forest_model_config= eamtc.get_config_value_or_raise('model_parameters.forest') def tearDown(self): """ @@ -283,31 +265,51 @@ def mock_load(*args,**kwargs): def testRandomForestTypePreservation(self): """ TypePreservationTest: To ensure that the serialization and deserialization - process maintains the data types of all model attributes. + process maintains the data types of all model attributes. + The model is trained, preditions stored, serialised and then desserialized. + The type of deserialised model attributes and the predictions of this must mast initial + serialised model. 
""" + ## Get trips for a user + test_user=uuid.UUID('feb6a3a8-a2ef-4f4a-8754-bd79f7154495') + ct_entry=eamtr._get_training_data(test_user,None) - logging.debug(f'(TRAIN) creating a model based on trips in database') - eamur.update_trip_model( - user_id=self.user_id, - model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, - model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, - min_trips=self.min_trips, - model_config=self.forest_model_config + split= int(len(ct_entry)*0.8) + trips=ct_entry[:split] + test_trips=ct_entry[split:] + + ## Build and train model + model_type= eamumt.ModelType.RANDOM_FOREST_CLASSIFIER + model = model_type.build(self.forest_model_config) + model.fit(trips) + + ## Get pre serialization predictions + predictions_list = eamur.predict_labels_with_n( + trip_list = test_trips, + model=model ) - - model = eamur._load_stored_trip_model( - user_id=self.user_id, - model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER, - model_storage=eamums.ModelStorage.DOCUMENT_DATABASE, - model_config=self.forest_model_config - ) - - model_data=model.to_dict() - loaded_model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER - loaded_model = loaded_model_type.build(self.forest_model_config) - loaded_model.from_dict(model_data) - + ## Serialise + serialised_model_data=model.to_dict() + + ## build and deserialise a different model + deserialised_model = model_type.build(self.forest_model_config) + deserialised_model.from_dict(serialised_model_data) + + ## test if the types are correct for attr in ['purpose_predictor','mode_predictor','replaced_predictor','purpose_enc','mode_enc','train_df']: - assert isinstance(getattr(loaded_model.model,attr),type(getattr(model.model,attr))) + deSerialised_attr_value=getattr(deserialised_model.model,attr) + original_attr_value=getattr(model.model,attr) + #Check type preservation + self.assertIsInstance(deSerialised_attr_value,type(original_attr_value), f"Type mismatch for {attr} ") + #Check for value equality. 
This assumes that the attributes are either direc + + ## test if the predictions are correct + deserialised_predictions_list = eamur.predict_labels_with_n( + trip_list = test_trips, + model=deserialised_model + ) + logging.debug(f'TESTIN:{deserialised_predictions_list}') + logging.debug(f'{predictions_list}') + self.assertEqual(deserialised_predictions_list,predictions_list,'predictions list not same.') diff --git a/emission/tests/modellingTests/TestRunForestModel.py b/emission/tests/modellingTests/TestRunForestModel.py index 1676e878d..6a81a0cb0 100644 --- a/emission/tests/modellingTests/TestRunForestModel.py +++ b/emission/tests/modellingTests/TestRunForestModel.py @@ -4,14 +4,17 @@ import emission.analysis.modelling.trip_model.run_model as eamur import emission.analysis.modelling.trip_model.model_type as eamumt import emission.analysis.modelling.trip_model.model_storage as eamums - +import emission.analysis.modelling.trip_model.models as eamtm +logger=logging.getLogger("") +logger.setLevel(logging.DEBUG) import emission.storage.timeseries.abstract_timeseries as esta import emission.tests.modellingTests.modellingTestAssets as etmm import emission.storage.decorations.analysis_timeseries_queries as esda import emission.core.get_database as edb import emission.storage.pipeline_queries as epq import emission.core.wrapper.pipelinestate as ecwp - +import emission.analysis.modelling.trip_model.forest_classifier as eamtf +from sklearn.ensemble import RandomForestClassifier class TestRunForestModel(unittest.TestCase): """ @@ -67,6 +70,10 @@ def setUp(self): threshold=0.004, # ~400m has_label_p=self.has_label_percent ) + #values required by forest model + for entry in train: + entry['data']['start_local_dt']=entry['metadata']['write_local_dt'] + entry['data']['end_local_dt']=entry['metadata']['write_local_dt'] ts.bulk_insert(train) @@ -92,7 +99,18 @@ def testBuildForestModelFromConfig(self): purposes but will load from a file in /conf/analysis/ which is tested here """ - eamumt.ModelType.RANDOM_FOREST_CLASSIFIER.build() + built_model = eamumt.ModelType.RANDOM_FOREST_CLASSIFIER.build() + attributes={'purpose_predictor': RandomForestClassifier , + 'mode_predictor' :RandomForestClassifier, + 'replaced_predictor':RandomForestClassifier, + 'purpose_enc' : eamtm.OneHotWrapper, + 'mode_enc':eamtm.OneHotWrapper } + self.assertIsInstance(built_model,eamtf.ForestClassifierModel) + for attr in attributes: + #logging.debug(f'{attr,attributes[attr]}') + x=getattr(built_model.model,attr) + self.assertIsInstance(x, attributes[attr]) # success if it didn't throw def testTrainForestModelWithZeroTrips(self): From 35a134631cd3290d2f4ffe2b08149a96126f7989 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Wed, 20 Mar 2024 23:44:36 -0400 Subject: [PATCH 25/28] Forest Model related data additions While testing model integration, 2 forest-model-specific features are added in the `TestForestModelIntegration.py` file rather than in the `entry.py` file.
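Before the entry.py change below, it may help to restate what the testRandomForestTypePreservation rewrite in the previous patch boils down to: a round trip through the forest classifier's to_dict()/from_dict() pair (which serialize each fitted component through an in-memory joblib buffer), with predictions compared before and after. A condensed sketch, assuming trips and test_trips are the 80/20 split of a user's confirmed trips and forest_model_config is the loaded forest config:

    import emission.analysis.modelling.trip_model.model_type as eamumt
    import emission.analysis.modelling.trip_model.run_model as eamur

    model_type = eamumt.ModelType.RANDOM_FOREST_CLASSIFIER
    model = model_type.build(forest_model_config)
    model.fit(trips)                                  # fit on the training split

    serialised = model.to_dict()                      # attribute name -> joblib bytes
    clone = model_type.build(forest_model_config)
    clone.from_dict(serialised)                       # rebuild predictors and encoders

    # the original and the deserialised model should predict identically
    assert (eamur.predict_labels_with_n(trip_list=test_trips, model=clone)
            == eamur.predict_labels_with_n(trip_list=test_trips, model=model))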
--- emission/core/wrapper/entry.py | 3 --- emission/tests/modellingTests/TestForestModelIntegration.py | 5 +++++ emission/tests/modellingTests/TestRunForestModel.py | 2 -- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/emission/core/wrapper/entry.py b/emission/core/wrapper/entry.py index a11eaac8c..b4d8520f7 100644 --- a/emission/core/wrapper/entry.py +++ b/emission/core/wrapper/entry.py @@ -182,9 +182,6 @@ def create_fake_entry(user_id, key, data, write_ts, create_id=False): result_entry.user_id = user_id result_entry.metadata = ecwm.Metadata.create_metadata_for_fake_result(key, write_ts) result_entry.data = data - #necessary values required by forest model - result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt'] - result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt'] result_entry._populateDependencies() return result_entry diff --git a/emission/tests/modellingTests/TestForestModelIntegration.py b/emission/tests/modellingTests/TestForestModelIntegration.py index fad84662b..44fe85afe 100644 --- a/emission/tests/modellingTests/TestForestModelIntegration.py +++ b/emission/tests/modellingTests/TestForestModelIntegration.py @@ -54,6 +54,11 @@ def setUp(self): threshold=0.004, # ~400m has_label_p=0.9 ) + + for result_entry in train: + result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt'] + result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt'] + ts.bulk_insert(train) # confirm data write did not fail check_data = esda.get_entries(key="analysis/confirmed_trip", user_id=self.testUUID, time_query=None) diff --git a/emission/tests/modellingTests/TestRunForestModel.py b/emission/tests/modellingTests/TestRunForestModel.py index 6a81a0cb0..8c6cd1650 100644 --- a/emission/tests/modellingTests/TestRunForestModel.py +++ b/emission/tests/modellingTests/TestRunForestModel.py @@ -5,8 +5,6 @@ import emission.analysis.modelling.trip_model.model_type as eamumt import emission.analysis.modelling.trip_model.model_storage as eamums import emission.analysis.modelling.trip_model.models as eamtm -logger=logging.getLogger("") -logger.setLevel(logging.DEBUG) import emission.storage.timeseries.abstract_timeseries as esta import emission.tests.modellingTests.modellingTestAssets as etmm import emission.storage.decorations.analysis_timeseries_queries as esda From 19bb394c8b2506036102f8008b05648e3c4c0ebb Mon Sep 17 00:00:00 2001 From: $aTyam Date: Fri, 22 Mar 2024 03:16:35 -0400 Subject: [PATCH 26/28] Update TestForestModelIntegration.py 2 more (4 in total) forest-model-specific features are now added after generating random trips for testing purposes.
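The diff below adds the remaining two fields on top of the local datetimes introduced in the previous patch; taken together, the integration test decorates each generated mock entry roughly as in this sketch (the ObjectId values are placeholders standing in for real place references, marked in the patch as required for Forest model inference):

    from bson.objectid import ObjectId

    for entry in train:
        # stand-in local datetimes, copied from the entry metadata
        entry['data']['start_local_dt'] = entry['metadata']['write_local_dt']
        entry['data']['end_local_dt'] = entry['metadata']['write_local_dt']
        # placeholder place ids, required for Forest model inference
        entry['data']['start_place'] = ObjectId()
        entry['data']['end_place'] = ObjectId()
    ts.bulk_insert(train)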
--- .../tests/modellingTests/TestForestModelIntegration.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/emission/tests/modellingTests/TestForestModelIntegration.py b/emission/tests/modellingTests/TestForestModelIntegration.py index 44fe85afe..88813b5c4 100644 --- a/emission/tests/modellingTests/TestForestModelIntegration.py +++ b/emission/tests/modellingTests/TestForestModelIntegration.py @@ -12,6 +12,8 @@ import emission.tests.common as etc import emission.pipeline.intake_stage as epi import logging +from bson.objectid import ObjectId + import emission.analysis.modelling.trip_model.config as eamtc import emission.analysis.modelling.trip_model.run_model as eamur @@ -54,11 +56,12 @@ def setUp(self): threshold=0.004, # ~400m has_label_p=0.9 ) - + ## Required for Forest model inference for result_entry in train: result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt'] result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt'] - + result_entry['data']['start_place']=ObjectId() + result_entry['data']['end_place']=ObjectId() ts.bulk_insert(train) # confirm data write did not fail check_data = esda.get_entries(key="analysis/confirmed_trip", user_id=self.testUUID, time_query=None) From 450094ce76bb950217d5355a8e038c2535889ede Mon Sep 17 00:00:00 2001 From: $aTyam Date: Fri, 22 Mar 2024 04:21:33 -0400 Subject: [PATCH 27/28] [TESTED] Updated ForestModelLoadAndSave.py Forest model specific values added in test setup for random trips --- emission/tests/modellingTests/TestForestModelLoadandSave.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/emission/tests/modellingTests/TestForestModelLoadandSave.py b/emission/tests/modellingTests/TestForestModelLoadandSave.py index 37768a689..38610fad4 100644 --- a/emission/tests/modellingTests/TestForestModelLoadandSave.py +++ b/emission/tests/modellingTests/TestForestModelLoadandSave.py @@ -67,6 +67,10 @@ def setUp(self): has_label_p=self.has_label_percent ) + for result_entry in test_data: + result_entry['data']['start_local_dt']=result_entry['metadata']['write_local_dt'] + result_entry['data']['end_local_dt']=result_entry['metadata']['write_local_dt'] + ts.bulk_insert(test_data) # confirm data write did not fail From ad968def40991bfa88b02e52c8413cd8926e6a1d Mon Sep 17 00:00:00 2001 From: $aTyam Date: Thu, 28 Mar 2024 09:54:12 -0400 Subject: [PATCH 28/28] Fixing the TestForestModelLoadandSave.py test: testRandomForestTypePreservation was using an `allceodata`-specific user id, and the tests on GitHub use a different db. Fixed by using the randomly generated samples instead. --- .../tests/modellingTests/TestForestModelLoadandSave.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/emission/tests/modellingTests/TestForestModelLoadandSave.py b/emission/tests/modellingTests/TestForestModelLoadandSave.py index 38610fad4..079bc908b 100644 --- a/emission/tests/modellingTests/TestForestModelLoadandSave.py +++ b/emission/tests/modellingTests/TestForestModelLoadandSave.py @@ -36,7 +36,6 @@ def setUp(self): # $clustered_trips * $has_label_percent > self.min_trips # must be correct or else this test could fail under some random test cases. - # for a negative test, below self.unused_user_id = 'asdjfkl;asdfjkl;asd08234ur13fi4jhf2103mkl' # Ensuring that no previous test data was left in DB after teardown, @@ -271,13 +270,10 @@ def testRandomForestTypePreservation(self): TypePreservationTest: To ensure that the serialization and deserialization process maintains the data types of all model attributes.
The model is trained, preditions stored, serialised and then desserialized. - The type of deserialised model attributes and the predictions of this must mast initial - serialised model. + The type of deserialised model attributes and the predictions of this must match + those of initial model. """ - ## Get trips for a user - test_user=uuid.UUID('feb6a3a8-a2ef-4f4a-8754-bd79f7154495') - ct_entry=eamtr._get_training_data(test_user,None) - + ct_entry=eamtr._get_training_data(self.user_id,None) split= int(len(ct_entry)*0.8) trips=ct_entry[:split] test_trips=ct_entry[split:]
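Taken together, this series leaves predict_labels_with_n taking a list of trips rather than a single wrapped trip. A minimal usage sketch against a stored model (user_id, test_trips and forest_model_config here stand for whatever user, held-out trips and config the model was trained with, so they are assumptions rather than values from the patches):

    import logging
    import emission.analysis.modelling.trip_model.run_model as eamur
    import emission.analysis.modelling.trip_model.model_type as eamumt
    import emission.analysis.modelling.trip_model.model_storage as eamums

    model = eamur._load_stored_trip_model(
        user_id=user_id,
        model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER,
        model_storage=eamums.ModelStorage.DOCUMENT_DATABASE,
        model_config=forest_model_config,
    )
    predictions_list = eamur.predict_labels_with_n(trip_list=test_trips, model=model)
    for prediction, n in predictions_list:
        if prediction:
            best = max(prediction, key=lambda r: r['p'])  # highest-confidence label tuple
            logging.debug(f"{best['labels']} (p={best['p']}, n={n})")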