From aa8cfd6d559b94a53c5dae2c90a8ee2824701441 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Sun, 20 Aug 2023 14:03:06 -0400 Subject: [PATCH 01/10] Moving Dependence from tour_model to trip_model The following changes support e-mission-server-eval-private's TRB_label_assist, reducing dependence on custom branch. --- .../analysis/modelling/similarity/similarity_metric.py | 10 ++++++++-- .../modelling/trip_model/greedy_similarity_binning.py | 9 ++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py index 6be00216f..7a88f9da7 100644 --- a/emission/analysis/modelling/similarity/similarity_metric.py +++ b/emission/analysis/modelling/similarity/similarity_metric.py @@ -27,15 +27,21 @@ def similarity(self, a: List[float], b: List[float]) -> List[float]: """ pass - def similar(self, a: List[float], b: List[float], thresh: float) -> bool: + def similar(self, a: List[float], b: List[float], thresh: float, clusteringWay :str = 'origin-destination') -> bool: """compares the features, returning true if they are similar within some threshold :param a: features for a trip :param b: features for another trip :param thresh: threshold for similarity + :clusterinWay: clustering based on origin/destination/origin-destination-pair :return: true if the feature similarity is within some threshold """ similarity_values = self.similarity(a, b) - is_similar = all(map(lambda sim: sim <= thresh, similarity_values)) + if clusteringWay == 'origin': + is_similar = similarity_values[0] <= thresh + elif clusteringWay == 'destination': + is_similar = similarity_values[1] <= thresh + else: + is_similar = all(map(lambda sim: sim <= thresh, similarity_values)) return is_similar diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index d750a451e..34157dee3 100644 --- 
a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -119,6 +119,11 @@ class label to apply: self.sim_thresh = config['similarity_threshold_meters'] self.apply_cutoff = config['apply_cutoff'] self.is_incremental = config['incremental_evaluation'] + if config.get('clustering_way') is None: + self.clusteringWay='origin-destination' # previous default + else: + self.clusteringWay= config['clustering_way'] + self.tripLabels=[] self.bins: Dict[str, Dict] = {} @@ -184,9 +189,11 @@ def _assign_bins(self, trips: List[ecwc.Confirmedtrip]): logging.debug(f"adding trip to bin {bin_id} with features {trip_features}") self.bins[bin_id]['feature_rows'].append(trip_features) self.bins[bin_id]['labels'].append(trip_labels) + self.tripLabels.append(bin_id) else: # create new bin new_bin_id = str(len(self.bins)) + self.tripLabels.append(new_bin_id) new_bin_record = { 'feature_rows': [trip_features], 'labels': [trip_labels], @@ -204,7 +211,7 @@ def _find_matching_bin_id(self, trip_features: List[float]) -> Optional[str]: :return: the id of a bin if a match was found, otherwise None """ for bin_id, bin_record in self.bins.items(): - matches_bin = all([self.metric.similar(trip_features, bin_sample, self.sim_thresh) + matches_bin = all([self.metric.similar(trip_features, bin_sample, self.sim_thresh,self.clusteringWay) for bin_sample in bin_record['feature_rows']]) if matches_bin: return bin_id From d9b4f7073c03838e01e5ee5d416aa6d462f0441e Mon Sep 17 00:00:00 2001 From: $aTyam Date: Thu, 24 Aug 2023 02:07:25 -0400 Subject: [PATCH 02/10] Generalising similarity calculations Moved the `clusteringWay`-based decision making while binning further upstream, thus generalising the `similar` (in `similarity_metric.py`) and `similarity` (in `od_similarity.py`) functions. Both can now be used across modules without the need for a `clusteringWay` parameter. 
--- .../modelling/similarity/od_similarity.py | 16 +++++++++++++--- .../modelling/similarity/similarity_metric.py | 14 +++++--------- .../trip_model/greedy_similarity_binning.py | 18 +++++++++++++----- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/emission/analysis/modelling/similarity/od_similarity.py b/emission/analysis/modelling/similarity/od_similarity.py index 3b84bd764..162b6f123 100644 --- a/emission/analysis/modelling/similarity/od_similarity.py +++ b/emission/analysis/modelling/similarity/od_similarity.py @@ -16,6 +16,16 @@ def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: return ctfe.od_features(trip) def similarity(self, a: List[float], b: List[float]) -> List[float]: - o_dist = ecc.calDistance([a[0], a[1]], [b[0], b[1]]) - d_dist = ecc.calDistance([a[2], a[3]], [b[2], b[3]]) - return [o_dist, d_dist] \ No newline at end of file + """ + a : a list of point features that can take either of two forms + 1. [point1_latitude,point1_longitude] + 2. [point1_latitude,point1_longitude,point2_latitude,point2_longitude] + + b : a list of point features that can take either of two forms + 1. [point1_latitude,point1_longitude] + 2. 
[point1_latitude,point1_longitude,point2_latitude,point2_longitude] + """ + point_dist = [ecc.calDistance(a[i:i+2], b[i:i+2]) + for i in range (0,len(a),2)] + + return point_dist \ No newline at end of file diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py index 7a88f9da7..e9c645e59 100644 --- a/emission/analysis/modelling/similarity/similarity_metric.py +++ b/emission/analysis/modelling/similarity/similarity_metric.py @@ -27,21 +27,17 @@ def similarity(self, a: List[float], b: List[float]) -> List[float]: """ pass - def similar(self, a: List[float], b: List[float], thresh: float, clusteringWay :str = 'origin-destination') -> bool: + def similar(self, a: List[float], b: List[float], thresh: float) -> bool: """compares the features, returning true if they are similar within some threshold - :param a: features for a trip + :param a: features for a trip , :param b: features for another trip :param thresh: threshold for similarity - :clusterinWay: clustering based on origin/destination/origin-destination-pair + :clusteringWay: clustering based on origin/destination/origin-destination-pair :return: true if the feature similarity is within some threshold """ similarity_values = self.similarity(a, b) - if clusteringWay == 'origin': - is_similar = similarity_values[0] <= thresh - elif clusteringWay == 'destination': - is_similar = similarity_values[1] <= thresh - else: - is_similar = all(map(lambda sim: sim <= thresh, similarity_values)) + is_similar = all(sim <= thresh for sim in similarity_values) + return is_similar diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 34157dee3..efcce4f02 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -207,14 +207,22 @@ def 
_find_matching_bin_id(self, trip_features: List[float]) -> Optional[str]: finds an existing bin where all bin features are "similar" to the incoming trip features. - :param trip_features: feature row for the incoming trip + :param trip_features: feature row for the incoming trip. + takes the form [orig_lat, orig_lon, dest_lat, dest_lon] :return: the id of a bin if a match was found, otherwise None """ for bin_id, bin_record in self.bins.items(): - matches_bin = all([self.metric.similar(trip_features, bin_sample, self.sim_thresh,self.clusteringWay) - for bin_sample in bin_record['feature_rows']]) - if matches_bin: - return bin_id + if self.clusteringWay == 'origin': + start,end=0,2 #since first two features in trip_features are for origin + elif self.clusteringWay == 'destination': + start,end=2,4 #third and fourth values intrip_features are for destination + elif self.clusteringWay == 'origin-destination': + start,end=0,4 #when clusteromgWay is 'origin-destination',we pass all four features + + matches_bin = all([self.metric.similar(trip_features[start:end], bin_sample[start:end], self.sim_thresh) + for bin_sample in bin_record['feature_rows']]) + if matches_bin: + return bin_id return None def _nearest_bin(self, trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optional[Dict]]: From b5595c9b409e23a35e7c00792adebe806ad928c4 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Thu, 24 Aug 2023 02:31:41 -0400 Subject: [PATCH 03/10] Minor Comment fixes Comment fixes for better readability. 
--- .../analysis/modelling/similarity/od_similarity.py | 11 +++++++++-- .../modelling/similarity/similarity_metric.py | 3 +-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/emission/analysis/modelling/similarity/od_similarity.py b/emission/analysis/modelling/similarity/od_similarity.py index 162b6f123..9a6a49d0d 100644 --- a/emission/analysis/modelling/similarity/od_similarity.py +++ b/emission/analysis/modelling/similarity/od_similarity.py @@ -22,9 +22,16 @@ def similarity(self, a: List[float], b: List[float]) -> List[float]: 2. [point1_latitude,point1_longitude,point2_latitude,point2_longitude] b : a list of point features that can take either of two forms - 1. [point1_latitude,point1_longitude] - 2. [point1_latitude,point1_longitude,point2_latitude,point2_longitude] + 1. [point3_latitude,point3_longitude] + 2. [point3_latitude,point3_longitude,point4_latitude,point4_longitude] + + It'll always take the same form as parameter a. + + return: a list of size 1 ([distance between point1-point3]) if a and b take form 1 + or of size 2 ([distance between point1-point3, distance between point2-point4]) + if a and b take form 2. 
""" + point_dist = [ecc.calDistance(a[i:i+2], b[i:i+2]) for i in range (0,len(a),2)] diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py index e9c645e59..1b520318f 100644 --- a/emission/analysis/modelling/similarity/similarity_metric.py +++ b/emission/analysis/modelling/similarity/similarity_metric.py @@ -31,10 +31,9 @@ def similar(self, a: List[float], b: List[float], thresh: float) -> bool: """compares the features, returning true if they are similar within some threshold - :param a: features for a trip , + :param a: features for a trip :param b: features for another trip :param thresh: threshold for similarity - :clusteringWay: clustering based on origin/destination/origin-destination-pair :return: true if the feature similarity is within some threshold """ similarity_values = self.similarity(a, b) From 710d1a5791212b540b883dedd5613a562071edc9 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Thu, 31 Aug 2023 02:10:39 -0400 Subject: [PATCH 04/10] Upgrading tests in `TestGreedySimilarityBinning.py` Tests were created to confirm that the trip-clustering configurations (origin, destination and origin-destination) work as expected inside the GreedySimilarityBinning class in `greedy_similarity_binning.py`. In order to upgrade the old tests, `generate_mock_trips` in `modellingTestAssets.py` was also changed. Previously, out of the n trips generated, m had both origin and destination either inside or outside the threshold, thus allowing only 2 configurations. Now 4 configurations are possible: only origin, only destination, both origin and destination, or neither origin nor destination within the threshold. The default is origin-and-destination (`trip_part='od'`), since this was the old behaviour. 
--- .../TestGreedySimilarityBinning.py | 416 +++++++++++++++++- .../modellingTests/modellingTestAssets.py | 16 +- 2 files changed, 419 insertions(+), 13 deletions(-) diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py index 32bed47aa..620f2cf99 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -10,10 +10,16 @@ def setUp(self) -> None: logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.DEBUG) - def testBinning(self): + def testNoBinning(self): """ - when $should_be_grouped trips are the same, they should appear in a bin + Tests the three (origin, destination and origin-destination based) + binning configuration for trips. + + When both the origin and destination points of trips are outside a threshold + limit, none of the trips should be binned with the other in any of the three + configs (origin, destination or origin-and-destination based). """ + label_data = { "mode_confirm": ['walk', 'bike', 'transit'], "purpose_confirm": ['work', 'home', 'school'], @@ -24,31 +30,414 @@ def testBinning(self): # within a radius that should have them binned. n = 20 m = 5 + + # trip_part: when mock trips are generated, coordinates of this part of + # m trips will be within the threshold. trip_part can take one + # among the four values: + # + # 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie + # within the mentioned threshold when trips are generated), + # + # 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned + # threshold when trips are generated), + # + # 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the + # mentioned threshold when trips are generated) + # + # 4. 
'od' ->(origin and destination,meaning BOTH origin and destination of m trips + # will lie within the mentioned threshold when trips are generated) + trips = etmm.generate_mock_trips( user_id="joe", trips=n, origin=(0, 0), - destination=(1, 1), + destination=(1, 1), + trip_part='__', label_data=label_data, within_threshold=m, threshold=0.001, # ~ 111 meters in degrees WGS84 ) + - # pass in a test configuration to the binning algorithm - model_config = { + # pass in a test configuration to the binning algorithm. + # + # clustering_way : Part of the trip used for checking pairwise proximity. + # Can take one of the three values: + # + # 1. 'origin' -> using origin of the trip to check if 2 points + # lie within the mentioned similarity_threshold_meters + # 2. 'destination' -> using destination of the trip to check if 2 points + # lie within the mentioned similarity_threshold_meters + # 3. 'origin-destination' -> both origin and destination of the trip to check + # if 2 points lie within the mentioned + # similarity_threshold_meters + + model1_config = { "metric": "od_similarity", - "similarity_threshold_meters": 500, # meters, + "similarity_threshold_meters": 111, # meters, "apply_cutoff": False, + "clustering_way": "origin", "incremental_evaluation": False } - model = eamtg.GreedySimilarityBinning(model_config) + model1 = eamtg.GreedySimilarityBinning(model1_config) + model1.fit(trips) + + + model2_config = { + "metric": "od_similarity", + "similarity_threshold_meters":111, # meters, + "apply_cutoff": False, + "clustering_way": "destination", + "incremental_evaluation": False + } + model2 = eamtg.GreedySimilarityBinning(model2_config) + model2.fit(trips) + + + model3_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 111, # meters, + "apply_cutoff": False, + "clustering_way": "origin-destination", + "incremental_evaluation": False + } + model3 = eamtg.GreedySimilarityBinning(model3_config) + model3.fit(trips) + + # Since neither the origin nor 
the destination of the points generated lie + # within the threshold, there should be no binning at all. All the bins should + # have size 1. + + at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model1.bins.values())) + self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") + + at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model2.bins.values())) + self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") + + at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model3.bins.values())) + self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") + + + + def testBinningByOrigin(self): + """ + Tests the 'origin' based binning method for trips. + + When only the origin points of trips are within a threshold + limit, trips must be binned together that too if binned based on + 'origins', otherwise no binning. + """ + + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive'] + } + + # generate $n trips. $m of them should have origin and destinations sampled + # within a radius that should have them binned. + n = 20 + m = 5 + + # trip_part: when mock trips are generated, coordinates of this part of + # m trips will be within the threshold. trip_part can take one + # among the four values: + # + # 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie + # within the mentioned threshold when trips are generated), + # + # 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned + # threshold when trips are generated), + # + # 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the + # mentioned threshold when trips are generated) + # + # 4. 
'od' ->(origin and destination,meaning BOTH origin and destination of m trips + # will lie within the mentioned threshold when trips are generated) + + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + trip_part='o_', + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + + # pass in a test configuration to the binning algorithm. + # + # clustering_way : Part of the trip used for checking pairwise proximity. + # Can take one of the three values: + # + # 1. 'origin' -> using origin of the trip to check if 2 points + # lie within the mentioned similarity_threshold_meters + # 2. 'destination' -> using destination of the trip to check if 2 points + # lie within the mentioned similarity_threshold_meters + # 3. 'origin-destination' -> both origin and destination of the trip to check + # if 2 points lie within the mentioned + # similarity_threshold_meters + + model1_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 111, # meters, + "apply_cutoff": False, + "clustering_way": "origin", + "incremental_evaluation": False + } + model1 = eamtg.GreedySimilarityBinning(model1_config) + model1.fit(trips) + + + model2_config = { + "metric": "od_similarity", + "similarity_threshold_meters":111, # meters, + "apply_cutoff": False, + "clustering_way": "destination", + "incremental_evaluation": False + } + model2 = eamtg.GreedySimilarityBinning(model2_config) + model2.fit(trips) + + + model3_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 111, # meters, + "apply_cutoff": False, + "clustering_way": "origin-destination", + "incremental_evaluation": False + } + model3 = eamtg.GreedySimilarityBinning(model3_config) + model3.fit(trips) + + + # Since only the origin of the points generated lies within the threshold, + # there should be binning only when 'origin' config is used. Otherwise all + # the bins should have size 1. 
+ + at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model1.bins.values())) + self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") + + at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) ==1, model2.bins.values())) + self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") + + at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model3.bins.values())) + self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") + + + + def testBinningByDestination(self): + """ + Tests the 'destination' based binning method for trips. + + When only the destination points of trips are within a threshold + limit, trips must be binned together that too if binned based on + 'destination', otherwise no binning. + """ + + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive'] + } + + # generate $n trips. $m of them should have origin and destinations sampled + # within a radius that should have them binned. + n = 20 + m = 5 + + # trip_part: when mock trips are generated, coordinates of this part of + # m trips will be within the threshold. trip_part can take one + # among the four values: + # + # 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie + # within the mentioned threshold when trips are generated), + # + # 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned + # threshold when trips are generated), + # + # 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the + # mentioned threshold when trips are generated) + # + # 4. 
'od' ->(origin and destination,meaning BOTH origin and destination of m trips + # will lie within the mentioned threshold when trips are generated) + + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + trip_part='_d', + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + + # pass in a test configuration to the binning algorithm. + # + # clustering_way : Part of the trip used for checking pairwise proximity. + # Can take one of the three values: + # + # 1. 'origin' -> using origin of the trip to check if 2 points + # lie within the mentioned similarity_threshold_meters + # 2. 'destination' -> using destination of the trip to check if 2 points + # lie within the mentioned similarity_threshold_meters + # 3. 'origin-destination' -> both origin and destination of the trip to check + # if 2 points lie within the mentioned + # similarity_threshold_meters - model.fit(trips) + model1_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 111, # meters, + "apply_cutoff": False, + "clustering_way": "origin", + "incremental_evaluation": False + } + model1 = eamtg.GreedySimilarityBinning(model1_config) + model1.fit(trips) + - # $m trip features should appear together in one bin - at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values())) + model2_config = { + "metric": "od_similarity", + "similarity_threshold_meters":111, # meters, + "apply_cutoff": False, + "clustering_way": "destination", + "incremental_evaluation": False + } + model2 = eamtg.GreedySimilarityBinning(model2_config) + model2.fit(trips) + + + model3_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 111, # meters, + "apply_cutoff": False, + "clustering_way": "origin-destination", + "incremental_evaluation": False + } + model3 = eamtg.GreedySimilarityBinning(model3_config) + model3.fit(trips) + + # Since only the destination of the points 
generated lies within the threshold, + # there should be binning only when 'destination' config is used. Otherwise all + # the bins should have size 1. + + at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model1.bins.values())) self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") + at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) ==m, model2.bins.values())) + self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") + + at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model3.bins.values())) + self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") + + + def testBinningByOriginAndDestination(self): + """ + Tests the 'origin-destination' based binning method for trips. + + When both the origin and destination points of trips are within + a threshold limit, trips will be binned together in all three (origin , + destination, origin-and-destinaiton) configurations. + """ + + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive'] + } + + # generate $n trips. $m of them should have origin and destinations sampled + # within a radius that should have them binned. + n = 20 + m = 5 + + # trip_part: when mock trips are generated, coordinates of this part of + # m trips will be within the threshold. trip_part can take one + # among the four values: + # + # 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie + # within the mentioned threshold when trips are generated), + # + # 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned + # threshold when trips are generated), + # + # 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the + # mentioned threshold when trips are generated) + # + # 4. 
'od' ->(origin and destination,meaning BOTH origin and destination of m trips + # will lie within the mentioned threshold when trips are generated) + + trips = etmm.generate_mock_trips( + user_id="joe", + trips=n, + origin=(0, 0), + destination=(1, 1), + trip_part='od', + label_data=label_data, + within_threshold=m, + threshold=0.001, # ~ 111 meters in degrees WGS84 + ) + + # pass in a test configuration to the binning algorithm. + # + # clustering_way : Part of the trip used for checking pairwise proximity. + # Can take one of the three values: + # + # 1. 'origin' -> using origin of the trip to check if 2 points + # lie within the mentioned similarity_threshold_meters + # 2. 'destination' -> using destination of the trip to check if 2 points + # lie within the mentioned similarity_threshold_meters + # 3. 'origin-destination' -> both origin and destination of the trip to check + # if 2 points lie within the mentioned + # similarity_threshold_meters + + model1_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 111, # meters, + "apply_cutoff": False, + "clustering_way": "origin", + "incremental_evaluation": False + } + model1 = eamtg.GreedySimilarityBinning(model1_config) + model1.fit(trips) + + + model2_config = { + "metric": "od_similarity", + "similarity_threshold_meters":111, # meters, + "apply_cutoff": False, + "clustering_way": "destination", + "incremental_evaluation": False + } + model2 = eamtg.GreedySimilarityBinning(model2_config) + model2.fit(trips) + + + model3_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 111, # meters, + "apply_cutoff": False, + "clustering_way": "origin-destination", + "incremental_evaluation": False + } + model3 = eamtg.GreedySimilarityBinning(model3_config) + model3.fit(trips) + + # Since both the origin and the destination points of the generated trips lie + # within the threshold, there should be binning in all three configs. 
+ + at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model1.bins.values())) + self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") + + at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) ==m, model2.bins.values())) + self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") + + at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model3.bins.values())) + self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") + + def testPrediction(self): """ training and testing with similar trips should lead to a positive bin match @@ -64,7 +453,8 @@ def testPrediction(self): user_id="joe", trips=n, origin=(0, 0), - destination=(1, 1), + destination=(1, 1), + trip_part='od', label_data=label_data, threshold=0.001, # ~ 111 meters in degrees WGS84 ) @@ -73,6 +463,7 @@ def testPrediction(self): "metric": "od_similarity", "similarity_threshold_meters": 500, # meters, "apply_cutoff": False, + "clustering_way": "origin-destination", "incremental_evaluation": False } model = eamtg.GreedySimilarityBinning(model_config) @@ -102,6 +493,7 @@ def testNoPrediction(self): trips=n, origin=(39.7645187, -104.9951944), # Denver, CO destination=(39.7435206, -105.2369292), # Golden, CO + trip_part='od', label_data=label_data, threshold=0.001, # ~ 111 meters in degrees WGS84 ) @@ -110,6 +502,7 @@ def testNoPrediction(self): trips=1, origin=(61.1042262, -150.5611644), # Anchorage, AK destination=(62.2721466, -150.3233046), # Talkeetna, AK + trip_part='od', label_data=label_data, threshold=0.001, # ~ 111 meters in degrees WGS84 ) @@ -118,6 +511,7 @@ def testNoPrediction(self): "metric": "od_similarity", "similarity_threshold_meters": 500, # meters, "apply_cutoff": False, + "clustering_way": "origin-destination", "incremental_evaluation": False } model = eamtg.GreedySimilarityBinning(model_config) diff --git 
a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index 879a3a2ca..de9b26cf4 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -131,6 +131,7 @@ def generate_mock_trips( trips, origin, destination, + trip_part='od', label_data = None, within_threshold = None, start_ts: None = None, @@ -159,6 +160,17 @@ def generate_mock_trips( :param trips: number of trips :param origin: origin coordinates :param destination: destination coordinates + :param trip_part: when mock trips are generated, coordinates of this part of + the trips will be within the threshold. trip_part can take one + among the four values: + 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie + within the mentioned threshold when trips are generated), + 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned + threshold when trips are generated), + 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the + mentioned threshold when trips are generated) + 4. 
'od' ->(origin and destination,meaning BOTH origin and destination of m trips + will lie within the mentioned threshold when trips are generated) :param label_data: dictionary of label data, see above, defaults to None :param within_threshold: number of trips that should fall within the provided distance threshold in degrees WGS84, defaults to None @@ -175,8 +187,8 @@ def generate_mock_trips( trips_within_threshold = [i < within for i in range(trips)] result = [] for within in trips_within_threshold: - o = generate_trip_coordinates(origin, within, threshold, max) - d = generate_trip_coordinates(destination, within, threshold, max) + o = generate_trip_coordinates(origin, (trip_part[0] == 'o' and within), threshold, max) + d = generate_trip_coordinates(destination, (trip_part[1] == 'd' and within), threshold, max) labels = {} if label_data is None or random.random() > has_label_p \ else sample_trip_labels( mode_labels=label_data.get('mode_confirm'), From 6d9ea7786142f7e10a30ab4be5b326b6634248bd Mon Sep 17 00:00:00 2001 From: $aTyam Date: Thu, 31 Aug 2023 15:28:14 -0400 Subject: [PATCH 05/10] Testing upgraded `Similarity` functionality Checking that `Similarity` behaves as expected when lists of size 2 (for only origin OR only destination) or size 4 (for origin AND destination) are passed. 
--- .../modellingTests/TestBackwardsCompat.py | 5 + .../TestRunGreedyIncrementalModel.py | 2 + .../modellingTests/TestRunGreedyModel.py | 3 + .../modellingTests/TestSimilarityMetric.py | 95 ++++++++++++++++--- .../modellingTests/modellingTestAssets.py | 2 +- 5 files changed, 92 insertions(+), 15 deletions(-) diff --git a/emission/tests/modellingTests/TestBackwardsCompat.py b/emission/tests/modellingTests/TestBackwardsCompat.py index b81b5f529..c3cba4fae 100644 --- a/emission/tests/modellingTests/TestBackwardsCompat.py +++ b/emission/tests/modellingTests/TestBackwardsCompat.py @@ -59,6 +59,7 @@ def testAnyVsAllWhilePredicting(self): "metric": "od_similarity", "similarity_threshold_meters": 16000, # meters, "apply_cutoff": False, + "clustering_way": 'origin-destination', "incremental_evaluation": False } new_builder = eamtg.GreedySimilarityBinning(model_config) @@ -96,6 +97,7 @@ def testRandomTripsWithinTheSameThreshold(self): trips=n, origin=(0, 0), destination=(1, 1), + trip_part='od', label_data=label_data, threshold=0.001, # ~ 111 meters in degrees WGS84 ) @@ -113,6 +115,7 @@ def testRandomTripsWithinTheSameThreshold(self): "metric": "od_similarity", "similarity_threshold_meters": 500, # meters, "apply_cutoff": False, + "clustering_way": 'origin-destination', "incremental_evaluation": False } new_model = eamtg.GreedySimilarityBinning(model_config) @@ -156,6 +159,7 @@ def testRandomTripsOutsideTheSameThreshold(self): trips=n, origin=(0, 0), destination=(1, 1), + trip_part='od', label_data=label_data, threshold=0.1, # Much bigger than the 500m threshold, so we will get multiple bins ) @@ -173,6 +177,7 @@ def testRandomTripsOutsideTheSameThreshold(self): "metric": "od_similarity", "similarity_threshold_meters": 500, # meters, "apply_cutoff": False, + "clustering_way": 'origin-destination', "incremental_evaluation": False } new_model = eamtg.GreedySimilarityBinning(model_config) diff --git a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py 
b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py index aee6a6f09..1529f8df5 100644 --- a/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py +++ b/emission/tests/modellingTests/TestRunGreedyIncrementalModel.py @@ -44,6 +44,7 @@ def setUp(self): "metric": "od_similarity", "similarity_threshold_meters": sim_threshold, "apply_cutoff": False, + "clustering_way": 'origin-destination', "incremental_evaluation": True } @@ -162,6 +163,7 @@ def testIncrementalRun(self): trips=self.new_trips_per_invocation, origin=self.origin, destination=self.destination, + trip_part='od', label_data=label_data, threshold=0.0001, # ~10m, start_ts=time.time() - 20, diff --git a/emission/tests/modellingTests/TestRunGreedyModel.py b/emission/tests/modellingTests/TestRunGreedyModel.py index 10f221909..9e4431fa3 100644 --- a/emission/tests/modellingTests/TestRunGreedyModel.py +++ b/emission/tests/modellingTests/TestRunGreedyModel.py @@ -62,6 +62,7 @@ def setUp(self): trips=self.total_trips, origin=self.origin, destination=self.destination, + trip_part='od', label_data=label_data, within_threshold=self.clustered_trips, threshold=0.004, # ~400m @@ -106,6 +107,7 @@ def testTrainGreedyModelWithZeroTrips(self): "metric": "od_similarity", "similarity_threshold_meters": 500, "apply_cutoff": False, + "clustering_way": 'origin-destination', "incremental_evaluation": False } @@ -142,6 +144,7 @@ def test1RoundTripGreedySimilarityBinning(self): "metric": "od_similarity", "similarity_threshold_meters": 500, "apply_cutoff": False, + "clustering_way": 'origin-destination', "incremental_evaluation": False } diff --git a/emission/tests/modellingTests/TestSimilarityMetric.py b/emission/tests/modellingTests/TestSimilarityMetric.py index ae37fc39a..f7c7b195f 100644 --- a/emission/tests/modellingTests/TestSimilarityMetric.py +++ b/emission/tests/modellingTests/TestSimilarityMetric.py @@ -6,26 +6,93 @@ class TestSimilarityMetric(unittest.TestCase): def testODsAreSimilar(self): 
generate_points_thresh = 0.001 # approx. 111 meters - similarity_threshold = 500 # - # random, but, points are sampled within a circle and should always be < sim threshold - trips = etmm.generate_mock_trips('bob', 2, [0, 0], [1, 1], threshold=generate_points_thresh) + similarity_threshold = 111 # + metric = eamso.OriginDestinationSimilarity() + ## Sub-Test 1 - 3 : + # random, but, origin and destination points are sampled within a circle and should always be < sim threshold + # Since both origin and destination poitns lie within threshold limits,they should be similar + # when we check by just origin or just destination or both origin-and-destination + + trips = etmm.generate_mock_trips('bob', 2, [0, 0], [1, 1], 'od',threshold=generate_points_thresh) coords0 = metric.extract_features(trips[0]) - coords1 = metric.extract_features(trips[1]) - similar = metric.similar(coords0, coords1, similarity_threshold) - self.assertTrue(similar) + coords1 = metric.extract_features(trips[1]) + similarOD1 = metric.similar(coords0, coords1, similarity_threshold) + similarOD2 = metric.similar(coords0[:2], coords1[:2], similarity_threshold) + similarOD3 = metric.similar(coords0[2:], coords1[2:], similarity_threshold) + + ## Sub-Test 4 : + # random, but, only origin points are sampled within a circle and should always be < sim threshold + # Since origin of two points lies within threshold limits,they should be similar + # when we check just origin for similarity. 
+ + + trips = etmm.generate_mock_trips('alice', 2, [0, 0], [1, 1], 'o_',threshold=generate_points_thresh) + coords0 = metric.extract_features(trips[0])[:2] + coords1 = metric.extract_features(trips[1])[:2] + similarO = metric.similar(coords0, coords1, similarity_threshold) + + ##Sub-Test 5 : + # random, but, only destination points are sampled within a circle and should always be < sim threshold + # Since destination of two points lies within threshold limits,they should be similar + # when we check just destination for similarity. + + trips = etmm.generate_mock_trips('Caty', 2, [0, 0], [1, 1], '_d',threshold=generate_points_thresh) + coords0 = metric.extract_features(trips[0])[2:] + coords1 = metric.extract_features(trips[1])[2:] + similarD = metric.similar(coords0, coords1, similarity_threshold) + + # All the similars must be true + self.assertTrue(similarOD1) # RESULT SUB-TEST 1 + self.assertTrue(similarOD2) # RESULT SUB-TEST 2 + self.assertTrue(similarOD3) # RESULT SUB-TEST 3 + self.assertTrue(similarO) # RESULT SUB-TEST 4 + self.assertTrue(similarD) # RESULT SUB-TEST 5 def testODsAreNotSimilar(self): generate_points_thresh = 0.001 # approx. 111 meters - similarity_threshold = 500 # - - trips0 = etmm.generate_mock_trips('bob', 1, [0, 0], [1, 1], threshold=generate_points_thresh) - trips1 = etmm.generate_mock_trips('alice', 1, [2, 2], [3, 3], threshold=generate_points_thresh) + similarity_threshold = 111 # + metric = eamso.OriginDestinationSimilarity() + + ## Sub-Test 1-2: + # Two trips with neither origin nor destination coordinates within threshold + # must not be similar in any configuration of similarity testing. 
+ trips = etmm.generate_mock_trips('bob', 2, [0, 0], [1, 1], '__', threshold=generate_points_thresh) + coords0 = metric.extract_features(trips[0]) + coords1 = metric.extract_features(trips[1]) + similar11 = metric.similar(coords0[:2], coords1[:2], similarity_threshold) + similar12 = metric.similar(coords0[2:], coords1[:], similarity_threshold) + + ## Sub-Test 3-4: + # Two trips with origin coordinates within threshold but we check + # similarity using destination coordinates or origin-and-destination + # should not be similar. + trips = etmm.generate_mock_trips('Alice', 2, [2, 2], [3, 3], 'o_', threshold=generate_points_thresh) metric = eamso.OriginDestinationSimilarity() - coords0 = metric.extract_features(trips0[0]) - coords1 = metric.extract_features(trips1[0]) - similar = metric.similar(coords0, coords1, similarity_threshold) - self.assertFalse(similar) + coords0 = metric.extract_features(trips[0]) + coords1 = metric.extract_features(trips[1]) + similar21 = metric.similar(coords0[2:], coords1[2:], similarity_threshold) + similar22 = metric.similar(coords0, coords1, similarity_threshold) + + ## Sub-Test 5-6: + # Two trips with destination coordinates within threshold but we check + # similarity using origin coordinates or origin-and-destination + # should not be similar. 
+ trips = etmm.generate_mock_trips('Caty', 2, [3, 3], [4, 4], '_d', threshold=generate_points_thresh) + metric = eamso.OriginDestinationSimilarity() + coords0 = metric.extract_features(trips[0]) + coords1 = metric.extract_features(trips[1]) + similar31 = metric.similar(coords0[:2], coords1[:2], similarity_threshold) + similar32 = metric.similar(coords0, coords1, similarity_threshold) + + # All the similars must be False + self.assertFalse(similar11) # RESULT SUB-TEST 1 + self.assertFalse(similar12) # RESULT SUB-TEST 2 + self.assertFalse(similar21) # RESULT SUB-TEST 3 + self.assertFalse(similar22) # RESULT SUB-TEST 4 + self.assertFalse(similar31) # RESULT SUB-TEST 5 + self.assertFalse(similar32) # RESULT SUB-TEST 6 + if __name__ == '__main__': unittest.main() diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index de9b26cf4..cb886670c 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -211,6 +211,6 @@ def generate_mock_trips( "purpose_confirm": ['work', 'home', 'school'], "replaced_mode": ['walk', 'bike', 'drive'] } - result = generate_mock_trips('joe-bob', 14, [0, 0], [1,1], label_data, 6) + result = generate_mock_trips('joe-bob', 14, [0, 0], [1,1],'od', label_data, 6) for r in result: print(r) \ No newline at end of file From 7f6d7542b5561861e5edfd0e27435cb11564078a Mon Sep 17 00:00:00 2001 From: $aTyam Date: Tue, 5 Sep 2023 16:02:28 -0400 Subject: [PATCH 06/10] Correcting logic behind binning and non-binning while testing 1. Improved logic based on this comment: https://github.com/e-mission/e-mission-server/pull/933/commits/710d1a5791212b540b883dedd5613a562071edc9#r1314065502 2. Created a utilities file for repetitive code required by multiple files. 3. Clustering threshold back to 500. 4. More in-code comments.
--- .../TestGreedySimilarityBinning.py | 505 ++---------------- .../modellingTests/TestSimilarityMetric.py | 127 ++--- emission/tests/modellingTests/utilities.py | 68 +++ 3 files changed, 165 insertions(+), 535 deletions(-) create mode 100644 emission/tests/modellingTests/utilities.py diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py index 620f2cf99..3e1cd78c2 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -1,6 +1,6 @@ import unittest import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg -import emission.tests.modellingTests.modellingTestAssets as etmm +import emission.tests.modellingTests.utilities as etmu import logging @@ -15,428 +15,60 @@ def testNoBinning(self): Tests the three (origin, destination and origin-destination based) binning configuration for trips. - When both the origin and destination points of trips are outside a threshold + When the origin and destination points of trips are outside a threshold limit, none of the trips should be binned with the other in any of the three configs (origin, destination or origin-and-destination based). """ - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive'] - } - - # generate $n trips. $m of them should have origin and destinations sampled - # within a radius that should have them binned. - n = 20 - m = 5 - - # trip_part: when mock trips are generated, coordinates of this part of - # m trips will be within the threshold. trip_part can take one - # among the four values: - # - # 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie - # within the mentioned threshold when trips are generated), - # - # 2. 
'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned - # threshold when trips are generated), - # - # 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the - # mentioned threshold when trips are generated) - # - # 4. 'od' ->(origin and destination,meaning BOTH origin and destination of m trips - # will lie within the mentioned threshold when trips are generated) - - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - trip_part='__', - label_data=label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - - - # pass in a test configuration to the binning algorithm. - # - # clustering_way : Part of the trip used for checking pairwise proximity. - # Can take one of the three values: - # - # 1. 'origin' -> using origin of the trip to check if 2 points - # lie within the mentioned similarity_threshold_meters - # 2. 'destination' -> using destination of the trip to check if 2 points - # lie within the mentioned similarity_threshold_meters - # 3. 
'origin-destination' -> both origin and destination of the trip to check - # if 2 points lie within the mentioned - # similarity_threshold_meters - - model1_config = { - "metric": "od_similarity", - "similarity_threshold_meters": 111, # meters, - "apply_cutoff": False, - "clustering_way": "origin", - "incremental_evaluation": False - } - model1 = eamtg.GreedySimilarityBinning(model1_config) - model1.fit(trips) - - - model2_config = { - "metric": "od_similarity", - "similarity_threshold_meters":111, # meters, - "apply_cutoff": False, - "clustering_way": "destination", - "incremental_evaluation": False - } - model2 = eamtg.GreedySimilarityBinning(model2_config) - model2.fit(trips) - - - model3_config = { - "metric": "od_similarity", - "similarity_threshold_meters": 111, # meters, - "apply_cutoff": False, - "clustering_way": "origin-destination", - "incremental_evaluation": False - } - model3 = eamtg.GreedySimilarityBinning(model3_config) - model3.fit(trips) - - # Since neither the origin nor the destination of the points generated lie - # within the threshold, there should be no binning at all. All the bins should - # have size 1. - - at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model1.bins.values())) - self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") - - at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model2.bins.values())) - self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") - - at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model3.bins.values())) - self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") - - - - def testBinningByOrigin(self): - """ - Tests the 'origin' based binning method for trips. - - When only the origin points of trips are within a threshold - limit, trips must be binned together that too if binned based on - 'origins', otherwise no binning. 
- """ - - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive'] - } - - # generate $n trips. $m of them should have origin and destinations sampled - # within a radius that should have them binned. - n = 20 - m = 5 - - # trip_part: when mock trips are generated, coordinates of this part of - # m trips will be within the threshold. trip_part can take one - # among the four values: - # - # 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie - # within the mentioned threshold when trips are generated), - # - # 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned - # threshold when trips are generated), - # - # 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the - # mentioned threshold when trips are generated) - # - # 4. 'od' ->(origin and destination,meaning BOTH origin and destination of m trips - # will lie within the mentioned threshold when trips are generated) + # generate $n trips. + n = 20 - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - trip_part='o_', - label_data=label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) + #this generates 20 trips one-by-one, where each trip's respective origin and destination + # points are more than 500m away. + trips = [ etmu.setTripConfig(1, (i, i), (i+1, i+1), 'od', 1)[0] for i in range(n)] - # pass in a test configuration to the binning algorithm. - # - # clustering_way : Part of the trip used for checking pairwise proximity. - # Can take one of the three values: - # - # 1. 'origin' -> using origin of the trip to check if 2 points - # lie within the mentioned similarity_threshold_meters - # 2. 'destination' -> using destination of the trip to check if 2 points - # lie within the mentioned similarity_threshold_meters - # 3. 
'origin-destination' -> both origin and destination of the trip to check - # if 2 points lie within the mentioned - # similarity_threshold_meters + # parameters passed for testing. A list, where each element is one way of clustering + clustering_ways_paramters= ["origin","destination","origin-destination"] - model1_config = { - "metric": "od_similarity", - "similarity_threshold_meters": 111, # meters, - "apply_cutoff": False, - "clustering_way": "origin", - "incremental_evaluation": False - } - model1 = eamtg.GreedySimilarityBinning(model1_config) - model1.fit(trips) - - - model2_config = { - "metric": "od_similarity", - "similarity_threshold_meters":111, # meters, - "apply_cutoff": False, - "clustering_way": "destination", - "incremental_evaluation": False - } - model2 = eamtg.GreedySimilarityBinning(model2_config) - model2.fit(trips) - - - model3_config = { - "metric": "od_similarity", - "similarity_threshold_meters": 111, # meters, - "apply_cutoff": False, - "clustering_way": "origin-destination", - "incremental_evaluation": False - } - model3 = eamtg.GreedySimilarityBinning(model3_config) - model3.fit(trips) - - - # Since only the origin of the points generated lies within the threshold, - # there should be binning only when 'origin' config is used. Otherwise all - # the bins should have size 1. 
- - at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model1.bins.values())) - self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") - - at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) ==1, model2.bins.values())) - self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") - - at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model3.bins.values())) - self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") - - - - def testBinningByDestination(self): - """ - Tests the 'destination' based binning method for trips. - - When only the destination points of trips are within a threshold - limit, trips must be binned together that too if binned based on - 'destination', otherwise no binning. + #Testing each of the three clustering_ways by passing them as parameters + for cw in clustering_ways_paramters: + with self.subTest(clustering_way=cw): + #initialise the binning model and fit with previously generated trips + model = etmu.setModelConfig("od_similarity", 500, False, cw, False) + model.fit(trips) + #check each bins for no of trips + no_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model.bins.values())) + #Since all trips were sampled outside the threshold, there should be no bin + # with more then 1 trip + self.assertTrue(no_large_bin,"no bin should have more than 1 features in it") + + def testBinning(self): """ + Tests the three (origin, destination and origin-destination based) + binning configuration for trips. - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive'] - } - - # generate $n trips. $m of them should have origin and destinations sampled - # within a radius that should have them binned. 
- n = 20 - m = 5 - - # trip_part: when mock trips are generated, coordinates of this part of - # m trips will be within the threshold. trip_part can take one - # among the four values: - # - # 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie - # within the mentioned threshold when trips are generated), - # - # 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned - # threshold when trips are generated), - # - # 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the - # mentioned threshold when trips are generated) - # - # 4. 'od' ->(origin and destination,meaning BOTH origin and destination of m trips - # will lie within the mentioned threshold when trips are generated) - - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - trip_part='_d', - label_data=label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - - # pass in a test configuration to the binning algorithm. - # - # clustering_way : Part of the trip used for checking pairwise proximity. - # Can take one of the three values: - # - # 1. 'origin' -> using origin of the trip to check if 2 points - # lie within the mentioned similarity_threshold_meters - # 2. 'destination' -> using destination of the trip to check if 2 points - # lie within the mentioned similarity_threshold_meters - # 3. 
'origin-destination' -> both origin and destination of the trip to check - # if 2 points lie within the mentioned - # similarity_threshold_meters - - model1_config = { - "metric": "od_similarity", - "similarity_threshold_meters": 111, # meters, - "apply_cutoff": False, - "clustering_way": "origin", - "incremental_evaluation": False - } - model1 = eamtg.GreedySimilarityBinning(model1_config) - model1.fit(trips) - - - model2_config = { - "metric": "od_similarity", - "similarity_threshold_meters":111, # meters, - "apply_cutoff": False, - "clustering_way": "destination", - "incremental_evaluation": False - } - model2 = eamtg.GreedySimilarityBinning(model2_config) - model2.fit(trips) - - - model3_config = { - "metric": "od_similarity", - "similarity_threshold_meters": 111, # meters, - "apply_cutoff": False, - "clustering_way": "origin-destination", - "incremental_evaluation": False - } - model3 = eamtg.GreedySimilarityBinning(model3_config) - model3.fit(trips) - - # Since only the destination of the points generated lies within the threshold, - # there should be binning only when 'destination' config is used. Otherwise all - # the bins should have size 1. - - at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model1.bins.values())) - self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") - - at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) ==m, model2.bins.values())) - self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") - - at_least_one_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model3.bins.values())) - self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") - - - def testBinningByOriginAndDestination(self): + When the points lie within threshold ,the trips are binned together. """ - Tests the 'origin-destination' based binning method for trips. 
- - When both the origin and destination points of trips are within - a threshold limit, trips will be binned together in all three (origin , - destination, origin-and-destinaiton) configurations. - """ - - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive'] - } - - # generate $n trips. $m of them should have origin and destinations sampled + # generate $n trips. $m of them should have origin sampled # within a radius that should have them binned. n = 20 m = 5 - # trip_part: when mock trips are generated, coordinates of this part of - # m trips will be within the threshold. trip_part can take one - # among the four values: - # - # 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie - # within the mentioned threshold when trips are generated), - # - # 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned - # threshold when trips are generated), - # - # 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the - # mentioned threshold when trips are generated) - # - # 4. 'od' ->(origin and destination,meaning BOTH origin and destination of m trips - # will lie within the mentioned threshold when trips are generated) - - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - trip_part='od', - label_data=label_data, - within_threshold=m, - threshold=0.001, # ~ 111 meters in degrees WGS84 - ) - - # pass in a test configuration to the binning algorithm. - # - # clustering_way : Part of the trip used for checking pairwise proximity. - # Can take one of the three values: - # - # 1. 'origin' -> using origin of the trip to check if 2 points - # lie within the mentioned similarity_threshold_meters - # 2. 'destination' -> using destination of the trip to check if 2 points - # lie within the mentioned similarity_threshold_meters - # 3. 
'origin-destination' -> both origin and destination of the trip to check - # if 2 points lie within the mentioned - # similarity_threshold_meters - - model1_config = { - "metric": "od_similarity", - "similarity_threshold_meters": 111, # meters, - "apply_cutoff": False, - "clustering_way": "origin", - "incremental_evaluation": False - } - model1 = eamtg.GreedySimilarityBinning(model1_config) - model1.fit(trips) - - - model2_config = { - "metric": "od_similarity", - "similarity_threshold_meters":111, # meters, - "apply_cutoff": False, - "clustering_way": "destination", - "incremental_evaluation": False - } - model2 = eamtg.GreedySimilarityBinning(model2_config) - model2.fit(trips) - - - model3_config = { - "metric": "od_similarity", - "similarity_threshold_meters": 111, # meters, - "apply_cutoff": False, - "clustering_way": "origin-destination", - "incremental_evaluation": False - } - model3 = eamtg.GreedySimilarityBinning(model3_config) - model3.fit(trips) - - # Since both the origin and the destination points of the generated trips lie - # within the threshold, there should be binning in all three configs. - - at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model1.bins.values())) - self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") - - at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) ==m, model2.bins.values())) - self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") - - at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model3.bins.values())) - self.assertTrue(at_least_one_large_bin, "no bin should have more than 1 features in it") - + # parameters passed for testing. 
A list, where each element of this list takes the form + # [trip part to be sampled within mentioned threshold , clustering way used to check similarity] + parameters= [["o_",'origin'],["_d",'destination'],["od",'origin-destination']] + for tp,cw in parameters: + with self.subTest(trip_part=tp,clustering_way=cw): + #generate random trips using utilities + trips = etmu.setTripConfig(trips=n, org=(0, 0), dest=(1, 1), + trip_part=tp, within_thr=m) + #initialise the binning model and fit with previously generated trips + model = etmu.setModelConfig("od_similarity", 500, False, cw, False) + model.fit(trips) + #check each bins for no of trips + at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values())) + #Since 5 trips were sampled within the threshold, there should be one bin with 5 trips + self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") def testPrediction(self): """ @@ -449,24 +81,10 @@ def testPrediction(self): } n = 6 - trips = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(0, 0), - destination=(1, 1), - trip_part='od', - label_data=label_data, - threshold=0.001, # ~ 111 meters in degrees WGS84 + trips = etmu.setTripConfig(trips=n, org=(0, 0), dest=(1, 1), + trip_part='od', label_data=label_data, ) - - model_config = { - "metric": "od_similarity", - "similarity_threshold_meters": 500, # meters, - "apply_cutoff": False, - "clustering_way": "origin-destination", - "incremental_evaluation": False - } - model = eamtg.GreedySimilarityBinning(model_config) + model = etmu.setModelConfig("od_similarity", 500, False, "origin-destination", False) train = trips[0:5] test = trips[5] @@ -486,36 +104,17 @@ def testNoPrediction(self): "purpose_confirm": ['pizza_party'], "replaced_mode": ['crabwalking'] } - n = 5 - train = etmm.generate_mock_trips( - user_id="joe", - trips=n, - origin=(39.7645187, -104.9951944), # Denver, CO - destination=(39.7435206, -105.2369292), # Golden, CO - 
trip_part='od', - label_data=label_data, - threshold=0.001, # ~ 111 meters in degrees WGS84 + + train = etmu.setTripConfig(trips=n, org=(39.7645187, -104.9951944), # Denver, CO + dest=(39.7435206, -105.2369292), # Golden, CO + trip_part='od', label_data=label_data ) - test = etmm.generate_mock_trips( - user_id="joe", - trips=1, - origin=(61.1042262, -150.5611644), # Anchorage, AK - destination=(62.2721466, -150.3233046), # Talkeetna, AK - trip_part='od', - label_data=label_data, - threshold=0.001, # ~ 111 meters in degrees WGS84 + test = etmu.setTripConfig(trips=n, org=(61.1042262, -150.5611644), # Denver, CO + dest=(62.2721466, -150.3233046), # Golden, CO + trip_part='od', label_data=label_data, ) - - model_config = { - "metric": "od_similarity", - "similarity_threshold_meters": 500, # meters, - "apply_cutoff": False, - "clustering_way": "origin-destination", - "incremental_evaluation": False - } - model = eamtg.GreedySimilarityBinning(model_config) - + model = etmu.setModelConfig("od_similarity", 500, False, "origin-destination", False) model.fit(train) results, n = model.predict(test[0]) diff --git a/emission/tests/modellingTests/TestSimilarityMetric.py b/emission/tests/modellingTests/TestSimilarityMetric.py index f7c7b195f..cbe500b23 100644 --- a/emission/tests/modellingTests/TestSimilarityMetric.py +++ b/emission/tests/modellingTests/TestSimilarityMetric.py @@ -1,98 +1,61 @@ import unittest -import emission.tests.modellingTests.modellingTestAssets as etmm import emission.analysis.modelling.similarity.od_similarity as eamso +import emission.tests.modellingTests.utilities as etmu class TestSimilarityMetric(unittest.TestCase): def testODsAreSimilar(self): generate_points_thresh = 0.001 # approx. 
111 meters - similarity_threshold = 111 # - + similarity_threshold = 500 # in meters metric = eamso.OriginDestinationSimilarity() - ## Sub-Test 1 - 3 : - # random, but, origin and destination points are sampled within a circle and should always be < sim threshold - # Since both origin and destination poitns lie within threshold limits,they should be similar - # when we check by just origin or just destination or both origin-and-destination - - trips = etmm.generate_mock_trips('bob', 2, [0, 0], [1, 1], 'od',threshold=generate_points_thresh) - coords0 = metric.extract_features(trips[0]) - coords1 = metric.extract_features(trips[1]) - similarOD1 = metric.similar(coords0, coords1, similarity_threshold) - similarOD2 = metric.similar(coords0[:2], coords1[:2], similarity_threshold) - similarOD3 = metric.similar(coords0[2:], coords1[2:], similarity_threshold) - - ## Sub-Test 4 : - # random, but, only origin points are sampled within a circle and should always be < sim threshold - # Since origin of two points lies within threshold limits,they should be similar - # when we check just origin for similarity. - - - trips = etmm.generate_mock_trips('alice', 2, [0, 0], [1, 1], 'o_',threshold=generate_points_thresh) - coords0 = metric.extract_features(trips[0])[:2] - coords1 = metric.extract_features(trips[1])[:2] - similarO = metric.similar(coords0, coords1, similarity_threshold) - - ##Sub-Test 5 : - # random, but, only destination points are sampled within a circle and should always be < sim threshold - # Since destination of two points lies within threshold limits,they should be similar - # when we check just destination for similarity. 
- - trips = etmm.generate_mock_trips('Caty', 2, [0, 0], [1, 1], '_d',threshold=generate_points_thresh) - coords0 = metric.extract_features(trips[0])[2:] - coords1 = metric.extract_features(trips[1])[2:] - similarD = metric.similar(coords0, coords1, similarity_threshold) - # All the similars must be true - self.assertTrue(similarOD1) # RESULT SUB-TEST 1 - self.assertTrue(similarOD2) # RESULT SUB-TEST 2 - self.assertTrue(similarOD3) # RESULT SUB-TEST 3 - self.assertTrue(similarO) # RESULT SUB-TEST 4 - self.assertTrue(similarD) # RESULT SUB-TEST 5 + # parameters passed for testing is set here. A list, where each element of this list takes the form + # [trip part to be sampled within mentioned threshold, (start_coord,end_coord)] + # Since the extracted_features function returns in the form [origin_lat,origin_long,destination_lat,destination_long], + # if clustering is to be done by : + # a.origin, we pass first two values of this list,i.e. from 0 till before 2 index + # b.destination, we pas last two values of this list,i.e. from 2 till before 4 index + # c.origin-destination, we pass the entire list , i.e. 
from 0 till before 4 index + parameters= [["od",(0,4)],["_d",(2,4)],["o_",(0,2)]] + + for tp,(coord_start,coord_end) in parameters: + with self.subTest(trip_part=tp): + #generate 2 trips with parameter values + trips = etmu.setTripConfig(2, [0, 0], [1, 1], trip_part=tp,threshold=generate_points_thresh) + # depending on the parametrs, extract the relevant coordinates + trip0_coords = metric.extract_features(trips[0])[coord_start:coord_end] + trip1_coords = metric.extract_features(trips[1])[coord_start:coord_end] + #check for similarity using relevant coordinates + similarOD = metric.similar(trip0_coords,trip1_coords, similarity_threshold) + # Since both origin and destination poitns lie within threshold limits,they should be similar + # when we check by just origin or just destination or both origin-and-destination + self.assertTrue(similarOD) def testODsAreNotSimilar(self): - generate_points_thresh = 0.001 # approx. 111 meters - similarity_threshold = 111 # - metric = eamso.OriginDestinationSimilarity() - - ## Sub-Test 1-2: - # Two trips with neither origin nor destination coordinates within threshold - # must not be similar in any configuration of similarity testing. - trips = etmm.generate_mock_trips('bob', 2, [0, 0], [1, 1], '__', threshold=generate_points_thresh) - coords0 = metric.extract_features(trips[0]) - coords1 = metric.extract_features(trips[1]) - similar11 = metric.similar(coords0[:2], coords1[:2], similarity_threshold) - similar12 = metric.similar(coords0[2:], coords1[:], similarity_threshold) - - ## Sub-Test 3-4: - # Two trips with origin coordinates within threshold but we check - # similarity using destination coordinates or origin-and-destination - # should not be similar. 
- trips = etmm.generate_mock_trips('Alice', 2, [2, 2], [3, 3], 'o_', threshold=generate_points_thresh) + similarity_threshold = 500 metric = eamso.OriginDestinationSimilarity() - coords0 = metric.extract_features(trips[0]) - coords1 = metric.extract_features(trips[1]) - similar21 = metric.similar(coords0[2:], coords1[2:], similarity_threshold) - similar22 = metric.similar(coords0, coords1, similarity_threshold) - - ## Sub-Test 5-6: - # Two trips with destination coordinates within threshold but we check - # similarity using origin coordinates or origin-and-destination - # should not be similar. - trips = etmm.generate_mock_trips('Caty', 2, [3, 3], [4, 4], '_d', threshold=generate_points_thresh) - metric = eamso.OriginDestinationSimilarity() - coords0 = metric.extract_features(trips[0]) - coords1 = metric.extract_features(trips[1]) - similar31 = metric.similar(coords0[:2], coords1[:2], similarity_threshold) - similar32 = metric.similar(coords0, coords1, similarity_threshold) - - # All the similars must be False - self.assertFalse(similar11) # RESULT SUB-TEST 1 - self.assertFalse(similar12) # RESULT SUB-TEST 2 - self.assertFalse(similar21) # RESULT SUB-TEST 3 - self.assertFalse(similar22) # RESULT SUB-TEST 4 - self.assertFalse(similar31) # RESULT SUB-TEST 5 - self.assertFalse(similar32) # RESULT SUB-TEST 6 + # parameters passed for testing is set. A list, where each element of this list takes the form + # [(start_coord,end_coord)] + # Since the extracted_features function return in the form [origin_lat,origin_long,destination_lat,destination_long], + # if clustering shouldn't happend, then + # a.origin, we pass first two values of this list,i.e. from 0 till before 2 index + # b.destination, we pas last two values of this list,i.e. from 2 till before 4 index + # c.origin-destination, we pass the entire list , i.e. 
from 0 till before 4 index + parameters= [(0,2),(2,4),[0,4]] + n=2 + #this generates 2 trips one-by-one, where each trip's respective origin and destination + # points are more than 500m away. + trips = [etmu.setTripConfig(1, (i, i), (i+1, i+1), 'od', 1)[0] for i in range(n)] + trip0_coord = metric.extract_features(trips[0]) + trip1_coord = metric.extract_features(trips[1]) + + for (coord_start,coord_end) in parameters: + with self.subTest(coordinates=(coord_start,coord_end)): + IsSimilar = metric.similar(trip0_coord[coord_start:coord_end],trip1_coord[coord_start:coord_end], similarity_threshold) + # Two trips with neither origin nor destination coordinates within the threshold + # must not be similar by any configuration of similarity testing. + self.assertFalse(IsSimilar) if __name__ == '__main__': unittest.main() diff --git a/emission/tests/modellingTests/utilities.py b/emission/tests/modellingTests/utilities.py new file mode 100644 index 000000000..9f03358bb --- /dev/null +++ b/emission/tests/modellingTests/utilities.py @@ -0,0 +1,68 @@ +import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg +import emission.tests.modellingTests.modellingTestAssets as etmm + +def setModelConfig(metric,threshold,cutoff,clustering_way,incrementalevaluation): + """ + TODO : tell about each param. + pass in a test configuration to the binning algorithm. + + clustering_way : Part of the trip used for checking pairwise proximity. + Can take one of the three values: + + 1. 'origin' -> using origin of the trip to check if 2 points + lie within the mentioned similarity_threshold_meters + 2. 'destination' -> using destination of the trip to check if 2 points + lie within the mentioned similarity_threshold_meters + 3. 
'origin-destination' -> both origin and destination of the trip to check + if 2 points lie within the mentioned + similarity_threshold_meters + """ + model_config = { + "metric": metric, + "similarity_threshold_meters": threshold, # meters, + "apply_cutoff": cutoff, + "clustering_way": clustering_way, + "incremental_evaluation": incrementalevaluation + } + + return eamtg.GreedySimilarityBinning(model_config) + + +def setTripConfig(trips,org,dest,trip_part,within_thr=None,label_data=None,threshold=0.001): + """ + TODO: Tell about each + trip_part: when mock trips are generated, coordinates of this part of + m trips will be within the threshold. trip_part can take one + among the four values: + + 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie + within the mentioned threshold when trips are generated), + + 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned + threshold when trips are generated), + + 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the + mentioned threshold when trips are generated) + + 4. 
'od' ->(origin and destination,meaning BOTH origin and destination of m trips + will lie within the mentioned threshold when trips are generated) + """ + if label_data == None: + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive'] + } + + trip =etmm.generate_mock_trips( + user_id="joe", + trips=trips, + origin=org, + destination=dest, + trip_part=trip_part, + label_data=label_data, + within_threshold=within_thr, + threshold=threshold, + ) + return trip + \ No newline at end of file From c35b7c1b6ce94786511f0884d04445e50625a157 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Fri, 8 Sep 2023 18:08:46 -0400 Subject: [PATCH 07/10] Changed logic for random trip generation Random trips are now generated like this: if certain trips are to be binned together (by 'o', 'd', 'od' or '__' (meaning NONE)), they are generated in proximity of the previous in-bin trip. Otherwise, if they are not to be binned together, we keep generating a random trip until we find one that would not bin with previously accepted trips. 
--- .../modellingTests/modellingTestAssets.py | 164 +++++++++++++++--- emission/tests/modellingTests/utilities.py | 68 -------- 2 files changed, 137 insertions(+), 95 deletions(-) delete mode 100644 emission/tests/modellingTests/utilities.py diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index cb886670c..f98736048 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -1,35 +1,145 @@ import random from typing import Optional, Tuple, List, Dict from uuid import UUID -import emission.analysis.modelling.trip_model.trip_model as eamtm +import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg +import emission.tests.modellingTests.modellingTestAssets as etmm import emission.core.wrapper.confirmedtrip as ecwc import emission.core.wrapper.entry as ecwe import time import math +def setModelConfig(metric,threshold,cutoff,clustering_way,incrementalevaluation): + """ + TODO: Write about each parameter to the function + pass in a test configuration to the binning algorithm. + + clustering_way : Part of the trip used for checking pairwise proximity. + Can take one of the three values: + + 1. 'origin' -> using origin of the trip to check if 2 points + lie within the mentioned similarity_threshold_meters + 2. 'destination' -> using destination of the trip to check if 2 points + lie within the mentioned similarity_threshold_meters + 3. 
'origin-destination' -> both origin and destination of the trip to check + if 2 points lie within the mentioned + similarity_threshold_meters + """ + model_config = { + "metric": metric, + "similarity_threshold_meters": threshold, # meters, + "apply_cutoff": cutoff, + "clustering_way": clustering_way, + "incremental_evaluation": incrementalevaluation + } + + return eamtg.GreedySimilarityBinning(model_config) + +def generate_random_point(): + """Generate a completetly random point valid WGS84 latitiude and longtidude""" + lat=random.uniform(-90,90) + lon=random.uniform(-180,180) + return [lat,lon] + +def generate_nearby_random_points(ref_coords,threshold): + """ + Generate valid WGS84 latitiude and longtidude in threshold(m) proximity to + ref coordinates + """ + + thresholdInWGS84 = threshold* (0.000001/0.11) + dx=random.uniform(-thresholdInWGS84/2,thresholdInWGS84/2) + dy=random.uniform(-thresholdInWGS84/2,thresholdInWGS84/2) + return [ref_coords[0] +dx , ref_coords[1] +dy] + +def calDistanceTest(point1, point2, coordinates=False): + """haversine distance + + :param point1: a coordinate in degrees WGS84 + :param point2: another coordinate in degrees WGS84 + :param coordinates: if false, expect a list of coordinates, defaults to False + :return: distance approximately in meters + """ + earthRadius = 6371000 # meters + if coordinates: + dLat = math.radians(point1.lat-point2.lat) + dLon = math.radians(point1.lon-point2.lon) + lat1 = math.radians(point1.lat) + lat2 = math.radians(point2.lat) + else: + dLat = math.radians(point1[1]-point2[1]) + dLon = math.radians(point1[0]-point2[0]) + lat1 = math.radians(point1[1]) + lat2 = math.radians(point2[1]) + + + a = (math.sin(dLat/2) ** 2) + ((math.sin(dLon/2) ** 2) * math.cos(lat1) * math.cos(lat2)) + c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a)) + d = earthRadius * c + + return d + +def setTripConfig(trips,trip_part,threshold,within_thr=None,label_data=None): + """ + TODO: Write about each parameter to the function + 
trip_part: when mock trips are generated, coordinates of this part of + m trips will be within the threshold. trip_part can take one + among the four values: + + 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie + within the mentioned threshold when trips are generated), + + 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned + threshold when trips are generated), + + 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the + mentioned threshold when trips are generated) + + 4. 'od' ->(origin and destination,meaning BOTH origin and destination of m trips + will lie within the mentioned threshold when trips are generated) + """ + if label_data == None: + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive'] + } + + trip =etmm.generate_mock_trips( + user_id="joe", + trips=trips, + trip_part=trip_part, + label_data=label_data, + within_threshold=within_thr, + threshold=threshold, + ) + return trip def generate_trip_coordinates( - ref_coords: Tuple[float, float], + points_list: list[float], within_threshold: bool, - threshold: float, - max: float = 0.1 # approx. 10km in WGS84 + threshold_meters: float, ) -> Tuple[float, float]: - """generates trip coordinate data to use when mocking a set of trip data. + """generates trip coordinate data to use when mocking a set of trip data.i + If the coordinate generated is to be binned together, it is generated in proximity of + the previous points in the points_list. Otherwise, if this point is not to be binned together, + keep generating a random trip unless we find one that would not bin with previously + accepeted trips. 
- :param ref_coords: reference coordinates to use as the center of the sampling circle - :param within_threshold: how many of these trips are within some distance threshold - :param threshold: the distance threshold, in WGS84 - :param max: max distance, in WGS84, defaults to 0.1 (approx. 10km) + :param points_list: list of all the previoushlt selected points + :param within_threshold: is this point supposed to be within some distance threshold + :param threshold_meters: the distance threshold, in meters :return: generated coordinate pairs sampled in a circle from some coordinates up to some threshold """ - angle = 2 * math.pi * random.random() - radius_threshold = threshold / 2 - radius = random.uniform(0, radius_threshold) if within_threshold else random.uniform(radius_threshold, max) - x = radius * math.cos(angle) + ref_coords[0] - y = radius * math.sin(angle) + ref_coords[1] - return (x, y) + + if within_threshold and points_list: + new_point = generate_nearby_random_points(random.choice(points_list), threshold_meters) + else: + new_point = generate_random_point() + while not all(calDistanceTest(new_point, pt) > threshold_meters for pt in points_list): + new_point = generate_random_point() + return new_point def extract_trip_labels(trips: List[ecwc.Confirmedtrip]) -> Dict: @@ -129,15 +239,12 @@ def build_mock_trip( def generate_mock_trips( user_id, trips, - origin, - destination, + threshold, trip_part='od', label_data = None, within_threshold = None, start_ts: None = None, end_ts: None = None, - threshold = 0.01, - max = 0.1, has_label_p = 1.0, seed = 0): """mocking function that generates multiple trips for a user. some are sampled @@ -158,8 +265,6 @@ def generate_mock_trips( :param user_id: user UUID :param trips: number of trips - :param origin: origin coordinates - :param destination: destination coordinates :param trip_part: when mock trips are generated, coordinates of this part of the trips will be within the threshold. 
trip_part can take one among the four values: @@ -173,10 +278,8 @@ def generate_mock_trips( will lie within the mentioned threshold when trips are generated) :param label_data: dictionary of label data, see above, defaults to None :param within_threshold: number of trips that should fall within the provided - distance threshold in degrees WGS84, defaults to None - :param threshold: distance threshold in WGS84 for sampling, defaults to 0.01 - :param max: maximum distance beyond the threshold for trips sampled that - are not within the threshold, defaults to 0.1 degrees WGS84 + distance threshold in m + :param threshold: distance threshold in WGS84 for sampling :param has_label_p: probability a trip has labels, defaults to 1.0 :param seed: random seed, defaults to 0 :return: randomly sampled trips @@ -186,9 +289,16 @@ def generate_mock_trips( within = within_threshold if within_threshold is not None else trips trips_within_threshold = [i < within for i in range(trips)] result = [] + origin_points=[] + destination_points=[] + + # generate trip number of points based on which among 'o' ,'d' or 'od' should be in threshold + # proximity to each other. 
for within in trips_within_threshold: - o = generate_trip_coordinates(origin, (trip_part[0] == 'o' and within), threshold, max) - d = generate_trip_coordinates(destination, (trip_part[1] == 'd' and within), threshold, max) + origin_points.append(generate_trip_coordinates(origin_points, (trip_part[0] == 'o' and within), threshold)) + destination_points.append(generate_trip_coordinates(destination_points, (trip_part[1] == 'd' and within), threshold)) + + for o,d in zip(origin_points,destination_points): labels = {} if label_data is None or random.random() > has_label_p \ else sample_trip_labels( mode_labels=label_data.get('mode_confirm'), diff --git a/emission/tests/modellingTests/utilities.py b/emission/tests/modellingTests/utilities.py deleted file mode 100644 index 9f03358bb..000000000 --- a/emission/tests/modellingTests/utilities.py +++ /dev/null @@ -1,68 +0,0 @@ -import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg -import emission.tests.modellingTests.modellingTestAssets as etmm - -def setModelConfig(metric,threshold,cutoff,clustering_way,incrementalevaluation): - """ - TODO : tell about each param. - pass in a test configuration to the binning algorithm. - - clustering_way : Part of the trip used for checking pairwise proximity. - Can take one of the three values: - - 1. 'origin' -> using origin of the trip to check if 2 points - lie within the mentioned similarity_threshold_meters - 2. 'destination' -> using destination of the trip to check if 2 points - lie within the mentioned similarity_threshold_meters - 3. 
'origin-destination' -> both origin and destination of the trip to check - if 2 points lie within the mentioned - similarity_threshold_meters - """ - model_config = { - "metric": metric, - "similarity_threshold_meters": threshold, # meters, - "apply_cutoff": cutoff, - "clustering_way": clustering_way, - "incremental_evaluation": incrementalevaluation - } - - return eamtg.GreedySimilarityBinning(model_config) - - -def setTripConfig(trips,org,dest,trip_part,within_thr=None,label_data=None,threshold=0.001): - """ - TODO: Tell about each - trip_part: when mock trips are generated, coordinates of this part of - m trips will be within the threshold. trip_part can take one - among the four values: - - 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie - within the mentioned threshold when trips are generated), - - 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned - threshold when trips are generated), - - 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the - mentioned threshold when trips are generated) - - 4. 'od' ->(origin and destination,meaning BOTH origin and destination of m trips - will lie within the mentioned threshold when trips are generated) - """ - if label_data == None: - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive'] - } - - trip =etmm.generate_mock_trips( - user_id="joe", - trips=trips, - origin=org, - destination=dest, - trip_part=trip_part, - label_data=label_data, - within_threshold=within_thr, - threshold=threshold, - ) - return trip - \ No newline at end of file From f5944ccef1140db347211c93b851b8864c0362ee Mon Sep 17 00:00:00 2001 From: $aTyam Date: Tue, 12 Sep 2023 12:27:47 -0400 Subject: [PATCH 08/10] [TESTED] Explicit clustering method, Improved mock trip generation `od_similarity.py` 1. 
Explicitly passing 'origin', 'destination', or 'origin-destination' for the similarity check in `similarity`. `similarity_metric.py` 2. Passing the clustering_way parameter. `greedy_similarity_binning.py` 3. Since this decision-making has moved downstream to `similarity`, removing it from here. `modellingTestAssets.py` 4. Removing both two-line wrappers (setModelConfig, setTripConfig) from this file, since the tests were parametrised using subTest two commits back. 5. Removed calDistanceTest. It was introduced to keep the tests' distance calculation separate from the calDistance used by `GreedySimilarityBinning`, which is unnecessary. 6. Using reference coordinates whenever provided to generate trip coordinates. If not, using randomly generated coordinates as reference points. 7. Receiving and passing origin and destination reference points in `generate_mock_trips`. `TestGreedySimilarityBinning.py` 8. Removed wrappers for trip and model generation. 9. Using a single threshold for both generating trips and binning, instead of two separate thresholds. `TestSimilarityMetric.py` 10. 
--- .../modelling/similarity/od_similarity.py | 34 ++-- .../modelling/similarity/similarity_metric.py | 12 +- .../trip_model/greedy_similarity_binning.py | 9 +- .../TestGreedySimilarityBinning.py | 109 ++++++++++--- .../modellingTests/TestSimilarityMetric.py | 26 ++-- .../modellingTests/modellingTestAssets.py | 145 +++++------------- 6 files changed, 168 insertions(+), 167 deletions(-) diff --git a/emission/analysis/modelling/similarity/od_similarity.py b/emission/analysis/modelling/similarity/od_similarity.py index 9a6a49d0d..056c721a3 100644 --- a/emission/analysis/modelling/similarity/od_similarity.py +++ b/emission/analysis/modelling/similarity/od_similarity.py @@ -15,24 +15,28 @@ class OriginDestinationSimilarity(eamss.SimilarityMetric): def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: return ctfe.od_features(trip) - def similarity(self, a: List[float], b: List[float]) -> List[float]: + def similarity(self, a: List[float], b: List[float], clustering_way='origin-destination') -> List[float]: """ - a : a list of point features that can take either of two forms - 1. [point1_latitude,point1_longitude] - 2. [point1_latitude,point1_longitude,point2_latitude,point2_longitude] + a : a list of point features that takes the forms + [point1_longitude,point1_latitude,point2_longitude,point2_latitude] - b : a list of point features that can take either of two forms - 1. [point3_latitude,point3_longitude] - 2. [point3_latitude,point3_longitude,point4_latitude,point4_longitude] - - It'll always take the same form as parameter a. - + b : a list of point features that takes the forms + [point1_longitude,point1_latitude,point2_longitude,point2_latitude] + + clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value. + tells the part of the trip to be used for binning trips together if that + part lies within threshold. 
+ return: a list of size 1 ([distance between point1-point3]) if a and b take form 1 or of size 2 ([distance between point1-point3, distance between point2-point4]) if a and b take form 2. """ - - point_dist = [ecc.calDistance(a[i:i+2], b[i:i+2]) - for i in range (0,len(a),2)] - - return point_dist \ No newline at end of file + origin_dist = ecc.calDistance(a[0:2], b[0:2]) + destination_dist=ecc.calDistance(a[2:4], b[2:4]) + + if clustering_way == 'origin-destination': + return [origin_dist,destination_dist] + elif clustering_way == 'origin': + return [origin_dist] + else: + return [destination_dist] \ No newline at end of file diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py index 1b520318f..c009be9e9 100644 --- a/emission/analysis/modelling/similarity/similarity_metric.py +++ b/emission/analysis/modelling/similarity/similarity_metric.py @@ -17,26 +17,32 @@ def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]: pass @abstractmethod - def similarity(self, a: List[float], b: List[float]) -> List[float]: + def similarity(self, a: List[float], b: List[float], clustering_way = 'origin-destination') -> List[float]: """compares the features, producing their similarity as computed by this similarity metric :param a: features for a trip :param b: features for another trip + :param clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value. + tells the part of the trip to be used for binning trips together if that + part lies within a threshold. 
:return: for each feature, the similarity of these features """ pass - def similar(self, a: List[float], b: List[float], thresh: float) -> bool: + def similar(self, a: List[float], b: List[float], thresh: float, clustering_way= 'origin-destination') -> bool: """compares the features, returning true if they are similar within some threshold :param a: features for a trip :param b: features for another trip :param thresh: threshold for similarity + :param clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value. + tells the part of the trip to be used for binning trips together if that + part lies within a threshold. :return: true if the feature similarity is within some threshold """ - similarity_values = self.similarity(a, b) + similarity_values = self.similarity(a, b, clustering_way) is_similar = all(sim <= thresh for sim in similarity_values) return is_similar diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index efcce4f02..226fdefb5 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -212,14 +212,7 @@ def _find_matching_bin_id(self, trip_features: List[float]) -> Optional[str]: :return: the id of a bin if a match was found, otherwise None """ for bin_id, bin_record in self.bins.items(): - if self.clusteringWay == 'origin': - start,end=0,2 #since first two features in trip_features are for origin - elif self.clusteringWay == 'destination': - start,end=2,4 #third and fourth values intrip_features are for destination - elif self.clusteringWay == 'origin-destination': - start,end=0,4 #when clusteromgWay is 'origin-destination',we pass all four features - - matches_bin = all([self.metric.similar(trip_features[start:end], bin_sample[start:end], self.sim_thresh) + matches_bin = all([self.metric.similar(trip_features, bin_sample, 
self.sim_thresh,self.clusteringWay) for bin_sample in bin_record['feature_rows']]) if matches_bin: return bin_id diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py index 3e1cd78c2..937effc94 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -1,6 +1,7 @@ import unittest import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg -import emission.tests.modellingTests.utilities as etmu +import emission.tests.modellingTests.modellingTestAssets as etmm + import logging @@ -21,11 +22,29 @@ def testNoBinning(self): """ # generate $n trips. - n = 20 - + n = 20 + binning_threshold=500 #this generates 20 trips one-by-one, where each trip's respective origin and destination # points are more than 500m away. - trips = [ etmu.setTripConfig(1, (i, i), (i+1, i+1), 'od', 1)[0] for i in range(n)] + + + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive'] + } + + + trips =etmm.generate_mock_trips( + user_id="joe", + trips=n, + trip_part='__', + label_data=label_data, + within_threshold=1, + threshold=binning_threshold, + origin=(0,0), + destination=(1,1) + ) # parameters passed for testing. 
A list, where each element is one way of clustering clustering_ways_paramters= ["origin","destination","origin-destination"] @@ -34,7 +53,14 @@ def testNoBinning(self): for cw in clustering_ways_paramters: with self.subTest(clustering_way=cw): #initialise the binning model and fit with previously generated trips - model = etmu.setModelConfig("od_similarity", 500, False, cw, False) + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": binning_threshold, # meters, + "apply_cutoff": False, + "clustering_way": cw, + "incremental_evaluation": False + } + model= eamtg.GreedySimilarityBinning(model_config) model.fit(trips) #check each bins for no of trips no_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model.bins.values())) @@ -53,6 +79,12 @@ def testBinning(self): # within a radius that should have them binned. n = 20 m = 5 + binning_threshold=500 + label_data = { + "mode_confirm": ['walk', 'bike', 'transit'], + "purpose_confirm": ['work', 'home', 'school'], + "replaced_mode": ['drive'] + } # parameters passed for testing. 
A list, where each element of this list takes the form # [trip part to be sampled within mentioned threshold , clustering way used to check similarity] @@ -60,10 +92,25 @@ def testBinning(self): for tp,cw in parameters: with self.subTest(trip_part=tp,clustering_way=cw): #generate random trips using utilities - trips = etmu.setTripConfig(trips=n, org=(0, 0), dest=(1, 1), - trip_part=tp, within_thr=m) + trips =etmm.generate_mock_trips( + user_id="joe", + trips=n, + trip_part=tp, + label_data=label_data, + within_threshold=m, + threshold=binning_threshold, + origin=(0,0), + destination=(1,1) + ) #initialise the binning model and fit with previously generated trips - model = etmu.setModelConfig("od_similarity", 500, False, cw, False) + model_config = { + "metric": "od_similarity" , + "similarity_threshold_meters": binning_threshold, # meters, + "apply_cutoff": False, + "clustering_way": cw, + "incremental_evaluation": False + } + model = eamtg.GreedySimilarityBinning(model_config) model.fit(trips) #check each bins for no of trips at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values())) @@ -81,11 +128,24 @@ def testPrediction(self): } n = 6 - trips = etmu.setTripConfig(trips=n, org=(0, 0), dest=(1, 1), - trip_part='od', label_data=label_data, - ) - model = etmu.setModelConfig("od_similarity", 500, False, "origin-destination", False) - + trips =etmm.generate_mock_trips( + user_id="joe", + trips=n, + trip_part='od', + label_data=label_data, + within_threshold=n, + threshold=500, + origin=(0,0), + destination=(1,1) + ) + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 500, # meters, + "apply_cutoff": False, + "clustering_way": 'origin_destination', + "incremental_evaluation": False + } + model= eamtg.GreedySimilarityBinning(model_config) train = trips[0:5] test = trips[5] @@ -105,16 +165,25 @@ def testNoPrediction(self): "replaced_mode": ['crabwalking'] } n = 5 - - train = etmu.setTripConfig(trips=n, 
org=(39.7645187, -104.9951944), # Denver, CO - dest=(39.7435206, -105.2369292), # Golden, CO - trip_part='od', label_data=label_data + binning_threshold = 500 + train = etmm.generate_mock_trips( user_id="joe",trips=n, origin=(39.7645187, -104.9951944), # Denver, CO + destination=(39.7435206, -105.2369292), # Golden, CO + trip_part='od', label_data=label_data, + threshold=binning_threshold, within_threshold=n ) - test = etmu.setTripConfig(trips=n, org=(61.1042262, -150.5611644), # Denver, CO - dest=(62.2721466, -150.3233046), # Golden, CO + test = etmm.generate_mock_trips( user_id="amanda",trips=n, origin=(61.1042262, -150.5611644), # Denver, CO + destination=(62.2721466, -150.3233046), # Golden, CO trip_part='od', label_data=label_data, + threshold=binning_threshold, within_threshold=n ) - model = etmu.setModelConfig("od_similarity", 500, False, "origin-destination", False) + model_config = { + "metric": "od_similarity", + "similarity_threshold_meters": 500, # meters, + "apply_cutoff": False, + "clustering_way": 'origin_destination', + "incremental_evaluation": False + } + model= eamtg.GreedySimilarityBinning(model_config) model.fit(train) results, n = model.predict(test[0]) diff --git a/emission/tests/modellingTests/TestSimilarityMetric.py b/emission/tests/modellingTests/TestSimilarityMetric.py index cbe500b23..fe038be4e 100644 --- a/emission/tests/modellingTests/TestSimilarityMetric.py +++ b/emission/tests/modellingTests/TestSimilarityMetric.py @@ -1,11 +1,9 @@ import unittest import emission.analysis.modelling.similarity.od_similarity as eamso -import emission.tests.modellingTests.utilities as etmu - +import emission.tests.modellingTests.modellingTestAssets as etmm class TestSimilarityMetric(unittest.TestCase): def testODsAreSimilar(self): - generate_points_thresh = 0.001 # approx. 
111 meters similarity_threshold = 500 # in meters metric = eamso.OriginDestinationSimilarity() @@ -16,17 +14,17 @@ def testODsAreSimilar(self): # a.origin, we pass first two values of this list,i.e. from 0 till before 2 index # b.destination, we pas last two values of this list,i.e. from 2 till before 4 index # c.origin-destination, we pass the entire list , i.e. from 0 till before 4 index - parameters= [["od",(0,4)],["_d",(2,4)],["o_",(0,2)]] + parameters= [["o_",'origin'],["_d",'destination'],["od",'origin-destination']] - for tp,(coord_start,coord_end) in parameters: + for tp,cw in parameters: with self.subTest(trip_part=tp): #generate 2 trips with parameter values - trips = etmu.setTripConfig(2, [0, 0], [1, 1], trip_part=tp,threshold=generate_points_thresh) + trips = etmm.generate_mock_trips('joe',2, threshold=similarity_threshold,origin=[0, 0], destination=[1, 1], within_threshold=2,trip_part=tp) # depending on the parametrs, extract the relevant coordinates - trip0_coords = metric.extract_features(trips[0])[coord_start:coord_end] - trip1_coords = metric.extract_features(trips[1])[coord_start:coord_end] + trip0_coords = metric.extract_features(trips[0]) + trip1_coords = metric.extract_features(trips[1]) #check for similarity using relevant coordinates - similarOD = metric.similar(trip0_coords,trip1_coords, similarity_threshold) + similarOD = metric.similar(trip0_coords,trip1_coords, similarity_threshold,cw) # Since both origin and destination poitns lie within threshold limits,they should be similar # when we check by just origin or just destination or both origin-and-destination self.assertTrue(similarOD) @@ -42,17 +40,17 @@ def testODsAreNotSimilar(self): # a.origin, we pass first two values of this list,i.e. from 0 till before 2 index # b.destination, we pas last two values of this list,i.e. from 2 till before 4 index # c.origin-destination, we pass the entire list , i.e. 
from 0 till before 4 index - parameters= [(0,2),(2,4),[0,4]] + parameters= ['origin','destination','origin-destination'] n=2 #this generates 2 trips one-by-one, where each trip's respective origin and destination # points are more than 500m away. - trips = [etmu.setTripConfig(1, (i, i), (i+1, i+1), 'od', 1)[0] for i in range(n)] + trips = [ etmm.generate_mock_trips('joe',2, origin=[i, i], destination=[i+1, i+1], trip_part= 'od', within_threshold=1,threshold=500)[0] for i in range(n)] trip0_coord = metric.extract_features(trips[0]) trip1_coord = metric.extract_features(trips[1]) - for (coord_start,coord_end) in parameters: - with self.subTest(coordinates=(coord_start,coord_end)): - IsSimilar = metric.similar(trip0_coord[coord_start:coord_end],trip1_coord[coord_start:coord_end], similarity_threshold) + for cw in parameters: + with self.subTest(clustering_way=cw): + IsSimilar = metric.similar(trip0_coord,trip1_coord, similarity_threshold,cw) # Two trips with neither origin nor destination coordinates within the threshold # must not be similar by any configuration of similarity testing. self.assertFalse(IsSimilar) diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index f98736048..9ad662fe3 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -2,122 +2,39 @@ from typing import Optional, Tuple, List, Dict from uuid import UUID import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg -import emission.tests.modellingTests.modellingTestAssets as etmm import emission.core.wrapper.confirmedtrip as ecwc - +import emission.core.common as ecc import emission.core.wrapper.entry as ecwe import time import math -def setModelConfig(metric,threshold,cutoff,clustering_way,incrementalevaluation): - """ - TODO: Write about each parameter to the function - pass in a test configuration to the binning algorithm. 
- - clustering_way : Part of the trip used for checking pairwise proximity. - Can take one of the three values: - - 1. 'origin' -> using origin of the trip to check if 2 points - lie within the mentioned similarity_threshold_meters - 2. 'destination' -> using destination of the trip to check if 2 points - lie within the mentioned similarity_threshold_meters - 3. 'origin-destination' -> both origin and destination of the trip to check - if 2 points lie within the mentioned - similarity_threshold_meters - """ - model_config = { - "metric": metric, - "similarity_threshold_meters": threshold, # meters, - "apply_cutoff": cutoff, - "clustering_way": clustering_way, - "incremental_evaluation": incrementalevaluation - } - - return eamtg.GreedySimilarityBinning(model_config) - def generate_random_point(): - """Generate a completetly random point valid WGS84 latitiude and longtidude""" + """Generate a completely random point with valid WGS84 latitude and longitude. + CAUTION : In order to save trips, GeoJSON requires points in [lon,lat] format""" lat=random.uniform(-90,90) lon=random.uniform(-180,180) - return [lat,lon] + return [lon,lat] def generate_nearby_random_points(ref_coords,threshold): """ Generate valid WGS84 latitiude and longtidude in threshold(m) proximity to - ref coordinates + ref coordinates. """ - + #convert given threshold in m to approx WGS84 coord dist. thresholdInWGS84 = threshold* (0.000001/0.11) + + #generate a random coordinate in threshold's limit around the ref points.
dx=random.uniform(-thresholdInWGS84/2,thresholdInWGS84/2) dy=random.uniform(-thresholdInWGS84/2,thresholdInWGS84/2) - return [ref_coords[0] +dx , ref_coords[1] +dy] - -def calDistanceTest(point1, point2, coordinates=False): - """haversine distance - - :param point1: a coordinate in degrees WGS84 - :param point2: another coordinate in degrees WGS84 - :param coordinates: if false, expect a list of coordinates, defaults to False - :return: distance approximately in meters - """ - earthRadius = 6371000 # meters - if coordinates: - dLat = math.radians(point1.lat-point2.lat) - dLon = math.radians(point1.lon-point2.lon) - lat1 = math.radians(point1.lat) - lat2 = math.radians(point2.lat) - else: - dLat = math.radians(point1[1]-point2[1]) - dLon = math.radians(point1[0]-point2[0]) - lat1 = math.radians(point1[1]) - lat2 = math.radians(point2[1]) - - a = (math.sin(dLat/2) ** 2) + ((math.sin(dLon/2) ** 2) * math.cos(lat1) * math.cos(lat2)) - c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a)) - d = earthRadius * c - - return d - -def setTripConfig(trips,trip_part,threshold,within_thr=None,label_data=None): - """ - TODO: Write about each parameter to the function - trip_part: when mock trips are generated, coordinates of this part of - m trips will be within the threshold. trip_part can take one - among the four values: - - 1. '__' ->(None, meaning NEITHER origin nor destination of any trip will lie - within the mentioned threshold when trips are generated), - - 2. 'o_' ->(origin, meaning ONLY origin of m trips will lie within the mentioned - threshold when trips are generated), - - 3. '_d' ->(destination),meaning ONLY destination of m trips will lie within the - mentioned threshold when trips are generated) - - 4. 
'od' ->(origin and destination,meaning BOTH origin and destination of m trips - will lie within the mentioned threshold when trips are generated) - """ - if label_data == None: - label_data = { - "mode_confirm": ['walk', 'bike', 'transit'], - "purpose_confirm": ['work', 'home', 'school'], - "replaced_mode": ['drive'] - } - - trip =etmm.generate_mock_trips( - user_id="joe", - trips=trips, - trip_part=trip_part, - label_data=label_data, - within_threshold=within_thr, - threshold=threshold, - ) - return trip + #This basically gives a way to sample a point from within a square of length thresholdInWGS84 + # around the ref. point. + return [ref_coords[0] +dx , ref_coords[1] +dy] def generate_trip_coordinates( - points_list: list[float], - within_threshold: bool, + points_list: list[float], + ref_coords, + InsideThreshold: bool, threshold_meters: float, ) -> Tuple[float, float]: """generates trip coordinate data to use when mocking a set of trip data.i @@ -132,12 +49,22 @@ def generate_trip_coordinates( :return: generated coordinate pairs sampled in a circle from some coordinates up to some threshold """ - - if within_threshold and points_list: - new_point = generate_nearby_random_points(random.choice(points_list), threshold_meters) - else: - new_point = generate_random_point() - while not all(calDistanceTest(new_point, pt) > threshold_meters for pt in points_list): + # if the point is to be generated within a threshold and it's not the first point + if InsideThreshold and points_list: + # if no ref. coordinates are provided, use any previously accepted point as ref. + if ref_coords == None: + ref_coords=random.choice(points_list) + # generate a new point in threshold proximity to ref. 
point + new_point = generate_nearby_random_points(ref_coords, threshold_meters) + else: # If point need not be in the threshold OR if its the first point we are generating, then + #Generate random coordinates if no reference coords were provided + if ref_coords == None: + new_point = generate_random_point() + else: + # if ref coordinate are provided, use them as the starting point and iterate till required + # condition is satisfied + new_point = ref_coords + while not all(ecc.calDistance(new_point, pt) > threshold_meters for pt in points_list): new_point = generate_random_point() return new_point @@ -241,6 +168,8 @@ def generate_mock_trips( trips, threshold, trip_part='od', + origin=None, + destination=None, label_data = None, within_threshold = None, start_ts: None = None, @@ -276,6 +205,8 @@ def generate_mock_trips( mentioned threshold when trips are generated) 4. 'od' ->(origin and destination,meaning BOTH origin and destination of m trips will lie within the mentioned threshold when trips are generated) + :param origin : reference point for trip origin generally + :param destination : reference point for trip destination generally :param label_data: dictionary of label data, see above, defaults to None :param within_threshold: number of trips that should fall within the provided distance threshold in m @@ -292,11 +223,11 @@ def generate_mock_trips( origin_points=[] destination_points=[] - # generate trip number of points based on which among 'o' ,'d' or 'od' should be in threshold - # proximity to each other. + # generate 'trip' number of points based on which among 'o' (Origin) ,'d' (Destination) or + # 'od' (Origin-Destination) or '__' (None) should be in threshold proximity to each other.
for within in trips_within_threshold: - origin_points.append(generate_trip_coordinates(origin_points, (trip_part[0] == 'o' and within), threshold)) - destination_points.append(generate_trip_coordinates(destination_points, (trip_part[1] == 'd' and within), threshold)) + origin_points.append(generate_trip_coordinates(origin_points, origin, InsideThreshold= (trip_part[0] == 'o' and within), threshold_meters= threshold)) + destination_points.append(generate_trip_coordinates(destination_points, destination, InsideThreshold=(trip_part[1] == 'd' and within), threshold_meters=threshold)) for o,d in zip(origin_points,destination_points): labels = {} if label_data is None or random.random() > has_label_p \ From 1c0526bb43d07807c9297dd2584ee926a27bf62b Mon Sep 17 00:00:00 2001 From: $aTyam Date: Wed, 13 Sep 2023 01:41:33 -0400 Subject: [PATCH 09/10] [TESTED]Improved random point generation logic Generating Random points from circle ( rather than Square) around ref_points. Better Explanations for random point generation. Whitespace fixes. 
--- .../TestGreedySimilarityBinning.py | 24 +++++++-------- .../modellingTests/modellingTestAssets.py | 29 ++++++++++++------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py index 937effc94..b96147706 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -139,12 +139,12 @@ def testPrediction(self): destination=(1,1) ) model_config = { - "metric": "od_similarity", - "similarity_threshold_meters": 500, # meters, - "apply_cutoff": False, - "clustering_way": 'origin_destination', - "incremental_evaluation": False - } + "metric": "od_similarity", + "similarity_threshold_meters": 500, # meters, + "apply_cutoff": False, + "clustering_way": 'origin_destination', + "incremental_evaluation": False + } model= eamtg.GreedySimilarityBinning(model_config) train = trips[0:5] test = trips[5] @@ -177,12 +177,12 @@ def testNoPrediction(self): threshold=binning_threshold, within_threshold=n ) model_config = { - "metric": "od_similarity", - "similarity_threshold_meters": 500, # meters, - "apply_cutoff": False, - "clustering_way": 'origin_destination', - "incremental_evaluation": False - } + "metric": "od_similarity", + "similarity_threshold_meters": 500, # meters, + "apply_cutoff": False, + "clustering_way": 'origin_destination', + "incremental_evaluation": False + } model= eamtg.GreedySimilarityBinning(model_config) model.fit(train) results, n = model.predict(test[0]) diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index 9ad662fe3..2e2fe8361 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -23,18 +23,26 @@ def generate_nearby_random_points(ref_coords,threshold): #convert given threshold in m to approx WGS84 
coord dist. thresholdInWGS84 = threshold* (0.000001/0.11) - #generate a random coordinate in threshold's limit around the ref points. - dx=random.uniform(-thresholdInWGS84/2,thresholdInWGS84/2) - dy=random.uniform(-thresholdInWGS84/2,thresholdInWGS84/2) + #generate a random coordinate in threshold's limit around the ref points. + # for eg, ref point is 0,0 and threshold is 100m , so we generate a radius from 0 to 50, say 34 + # in this example. A random angle (theta) is also generated from 0 to 360 degrees (0 to 2*pi radians), say 0. We then take 34 steps along x axis direction + # till radius length to get our new point, (34,0). When this function is called the next time to generate a point + #that has to be binned with previous one, we again generate r and theta , say 24 , 180 this time. + # Now this new point is at (-24,0). Both these points are within threshold (100 in this case)limit and therefore will + #be binned together. + radius=random.uniform(0,thresholdInWGS84/2) + theta=random.uniform(0,2*math.pi) + dx = radius * math.cos(theta) + dy = radius * math.sin (theta) #This basically gives a way to sample a point from within a square of length thresholdInWGS84 # around the ref. point. - return [ref_coords[0] +dx , ref_coords[1] +dy] + return [ref_coords[0] + dy , ref_coords[1] + dx] def generate_trip_coordinates( points_list: list[float], ref_coords, - InsideThreshold: bool, + insideThreshold: bool, threshold_meters: float, ) -> Tuple[float, float]: """generates trip coordinate data to use when mocking a set of trip data.i @@ -50,7 +58,7 @@ def generate_trip_coordinates( circle from some coordinates up to some threshold """ # if the point is to be generated within a threshold and it's not the first point - if InsideThreshold and points_list: + if insideThreshold and points_list: # if no ref. coordinates are provided, use any previously accepted point as ref.
if ref_coords == None: ref_coords=random.choice(points_list) @@ -61,9 +69,10 @@ def generate_trip_coordinates( if ref_coords == None: new_point = generate_random_point() else: - # if ref coordinate are provided, use them as the starting point and iterate till required - # condition is satisfied + # if ref coordinate are provided, use them as the startisng point. new_point = ref_coords + # If the newly generated new_point ( be it when ref_coords given or not given) is not more + # than threshold_meters away from all the previously accepted points, keep generating new_point # while not all(ecc.calDistance(new_point, pt) > threshold_meters for pt in points_list): new_point = generate_random_point() return new_point @@ -226,8 +235,8 @@ def generate_mock_trips( # generate 'trip' number of points based on which among 'o' (Origin) ,'d' (Destination) or # 'od' (Origin-Destination) or '__' (None) should be in threshold proximity to each other. for within in trips_within_threshold: - origin_points.append(generate_trip_coordinates(origin_points, origin, InsideThreshold= (trip_part[0] == 'o' and within), threshold_meters= threshold)) - destination_points.append(generate_trip_coordinates(destination_points, destination, InsideThreshold=(trip_part[1] == 'd' and within), threshold_meters=threshold)) + origin_points.append(generate_trip_coordinates(origin_points, origin, insideThreshold= (trip_part[0] == 'o' and within), threshold_meters= threshold)) + destination_points.append(generate_trip_coordinates(destination_points, destination, insideThreshold=(trip_part[1] == 'd' and within), threshold_meters=threshold)) for o,d in zip(origin_points,destination_points): labels = {} if label_data is None or random.random() > has_label_p \ From 21305df9ce756a6e032064ca64a24e3185cd8b81 Mon Sep 17 00:00:00 2001 From: $aTyam Date: Thu, 14 Sep 2023 00:15:30 -0400 Subject: [PATCH 10/10] Minor fixes Comments and variable names fixed --- emission/tests/modellingTests/TestGreedySimilarityBinning.py 
| 4 ++-- emission/tests/modellingTests/modellingTestAssets.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py index b96147706..31b3261ae 100644 --- a/emission/tests/modellingTests/TestGreedySimilarityBinning.py +++ b/emission/tests/modellingTests/TestGreedySimilarityBinning.py @@ -113,9 +113,9 @@ def testBinning(self): model = eamtg.GreedySimilarityBinning(model_config) model.fit(trips) #check each bins for no of trips - at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values())) + one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values())) #Since 5 trips were sampled within the threshold, there should be one bin with 5 trips - self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it") + self.assertTrue(one_large_bin, "one bin should have 5 features in it") def testPrediction(self): """ diff --git a/emission/tests/modellingTests/modellingTestAssets.py b/emission/tests/modellingTests/modellingTestAssets.py index 2e2fe8361..252b2ad34 100644 --- a/emission/tests/modellingTests/modellingTestAssets.py +++ b/emission/tests/modellingTests/modellingTestAssets.py @@ -35,7 +35,7 @@ def generate_nearby_random_points(ref_coords,threshold): theta=random.uniform(0,2*math.pi) dx = radius * math.cos(theta) dy = radius * math.sin (theta) - #This basically gives a way to sample a point from within a square of length thresholdInWGS84 + #This basically gives a way to sample a point from within a circle of radius thresholdInWGS84/2 # around the ref. point. return [ref_coords[0] + dy , ref_coords[1] + dx]