e-mission · shankari · Sep 14, 2023 · Aug 20, 2023 · Aug 24, 2023 · Aug 24, 2023
diff --git a/emission/analysis/modelling/similarity/od_similarity.py b/emission/analysis/modelling/similarity/od_similarity.py
@@ -15,7 +15,28 @@ class OriginDestinationSimilarity(eamss.SimilarityMetric):
     def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]:
         return ctfe.od_features(trip)
 
-    def similarity(self, a: List[float], b: List[float]) -> List[float]:
-        o_dist = ecc.calDistance([a[0], a[1]], [b[0], b[1]])
-        d_dist = ecc.calDistance([a[2], a[3]], [b[2], b[3]])
-        return [o_dist, d_dist]
+    def similarity(self, a: List[float], b: List[float], clustering_way='origin-destination') -> List[float]:
+        """
+        a : a list of point features that takes the forms
+          [point1_longitude,point1_latitude,point2_longitude,point2_latitude] 
+
+        b : a list of point features that takes the forms
+          [point1_longitude,point1_latitude,point2_longitude,point2_latitude] 
+
+        clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value.
+                         tells the part of the trip to be used for binning trips together if that 
+                         part lies within threshold.
+
+        return: [origin_dist, destination_dist] when clustering_way is 'origin-destination',
+                [origin_dist] when it is 'origin', and [destination_dist] for any other value;
+                each entry is the ecc.calDistance result for that pair of points.
+        """
+        origin_dist = ecc.calDistance(a[0:2], b[0:2])
+        destination_dist=ecc.calDistance(a[2:4], b[2:4])
+
+        if clustering_way == 'origin-destination':
+            return [origin_dist,destination_dist]
+        elif clustering_way == 'origin':
+            return [origin_dist]
+        else:
+            return [destination_dist]
diff --git a/emission/analysis/modelling/similarity/similarity_metric.py b/emission/analysis/modelling/similarity/similarity_metric.py
@@ -17,25 +17,32 @@ def extract_features(self, trip: ecwc.Confirmedtrip) -> List[float]:
         pass
 
     @abstractmethod
-    def similarity(self, a: List[float], b: List[float]) -> List[float]:
+    def similarity(self, a: List[float], b: List[float], clustering_way = 'origin-destination') -> List[float]:
         """compares the features, producing their similarity
         as computed by this similarity metric
 
         :param a: features for a trip
         :param b: features for another trip
+        :param clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value.
+                                tells the part of the trip to be used for binning trips together if that 
+                                part lies within a threshold.
         :return: for each feature, the similarity of these features
         """
         pass
 
-    def similar(self, a: List[float], b: List[float], thresh: float) -> bool:
+    def similar(self, a: List[float], b: List[float], thresh: float, clustering_way= 'origin-destination') -> bool:
         """compares the features, returning true if they are similar
         within some threshold
 
-        :param a: features for a trip
+        :param a: features for a trip 
         :param b: features for another trip
         :param thresh: threshold for similarity
+        :param clustering_way : takes one among 'origin', 'destination', 'origin-destination' as value.
+                                tells the part of the trip to be used for binning trips together if that 
+                                part lies within a threshold.
         :return: true if the feature similarity is within some threshold
         """
-        similarity_values = self.similarity(a, b)
-        is_similar = all(map(lambda sim: sim <= thresh, similarity_values))
+        similarity_values = self.similarity(a, b, clustering_way)
+        is_similar = all(sim <= thresh for sim in similarity_values)
+
         return is_similar
diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py
@@ -119,6 +119,11 @@ class label to apply:
         self.sim_thresh = config['similarity_threshold_meters']
         self.apply_cutoff = config['apply_cutoff']
         self.is_incremental = config['incremental_evaluation']
+        if config.get('clustering_way') is None:
+            self.clusteringWay='origin-destination'   # previous default
+        else:
+            self.clusteringWay= config['clustering_way'] 
+        self.tripLabels=[]
 
         self.bins: Dict[str, Dict] = {}
 
@@ -184,9 +189,11 @@ def _assign_bins(self, trips: List[ecwc.Confirmedtrip]):
                 logging.debug(f"adding trip to bin {bin_id} with features {trip_features}")
                 self.bins[bin_id]['feature_rows'].append(trip_features)
                 self.bins[bin_id]['labels'].append(trip_labels)
+                self.tripLabels.append(bin_id)
             else:
                 # create new bin
                 new_bin_id = str(len(self.bins))
+                self.tripLabels.append(new_bin_id)
                 new_bin_record = {
                     'feature_rows': [trip_features],
                     'labels': [trip_labels],
@@ -200,14 +207,15 @@ def _find_matching_bin_id(self, trip_features: List[float]) -> Optional[str]:
         finds an existing bin where all bin features are "similar" to the incoming
         trip features.
 
-        :param trip_features: feature row for the incoming trip
+        :param trip_features: feature row for the incoming trip. 
+                            takes the form [orig_lat, orig_lon, dest_lat, dest_lon]
         :return: the id of a bin if a match was found, otherwise None
         """
         for bin_id, bin_record in self.bins.items():
-                matches_bin = all([self.metric.similar(trip_features, bin_sample, self.sim_thresh)
-                    for bin_sample in bin_record['feature_rows']])
-                if matches_bin:
-                    return bin_id
+            matches_bin = all([self.metric.similar(trip_features, bin_sample, self.sim_thresh,self.clusteringWay)
+                for bin_sample in bin_record['feature_rows']])
+            if matches_bin:
+                return bin_id
         return None
 
     def _nearest_bin(self, trip: ecwc.Confirmedtrip) -> Tuple[Optional[int], Optional[Dict]]:

diff --git a/emission/tests/modellingTests/TestBackwardsCompat.py b/emission/tests/modellingTests/TestBackwardsCompat.py
@@ -59,6 +59,7 @@ def testAnyVsAllWhilePredicting(self):
             "metric": "od_similarity",
             "similarity_threshold_meters": 16000,      # meters,
             "apply_cutoff": False,
+            "clustering_way": 'origin-destination',
             "incremental_evaluation": False
         }
         new_builder = eamtg.GreedySimilarityBinning(model_config)
@@ -96,6 +97,7 @@ def testRandomTripsWithinTheSameThreshold(self):
             trips=n, 
             origin=(0, 0), 
             destination=(1, 1), 
+            trip_part='od',
             label_data=label_data, 
             threshold=0.001,  # ~ 111 meters in degrees WGS84
         )
@@ -113,6 +115,7 @@ def testRandomTripsWithinTheSameThreshold(self):
             "metric": "od_similarity",
             "similarity_threshold_meters": 500,      # meters,
             "apply_cutoff": False,
+            "clustering_way": 'origin-destination',
             "incremental_evaluation": False
         }
         new_model = eamtg.GreedySimilarityBinning(model_config)
@@ -156,6 +159,7 @@ def testRandomTripsOutsideTheSameThreshold(self):
             trips=n, 
             origin=(0, 0), 
             destination=(1, 1), 
+            trip_part='od', 
             label_data=label_data, 
             threshold=0.1,  # Much bigger than the 500m threshold, so we will get multiple bins
         )
@@ -173,6 +177,7 @@ def testRandomTripsOutsideTheSameThreshold(self):
             "metric": "od_similarity",
             "similarity_threshold_meters": 500,      # meters,
             "apply_cutoff": False,
+            "clustering_way": 'origin-destination',
             "incremental_evaluation": False
         }
         new_model = eamtg.GreedySimilarityBinning(model_config)

diff --git a/emission/tests/modellingTests/TestGreedySimilarityBinning.py b/emission/tests/modellingTests/TestGreedySimilarityBinning.py
@@ -1,6 +1,7 @@
 import unittest
 import emission.analysis.modelling.trip_model.greedy_similarity_binning as eamtg
 import emission.tests.modellingTests.modellingTestAssets as etmm
+
 import logging
 
 
@@ -10,44 +11,111 @@ def setUp(self) -> None:
         logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
         level=logging.DEBUG)
 
-    def testBinning(self):
+    def testNoBinning(self):
         """
-        when $should_be_grouped trips are the same, they should appear in a bin
+        Tests the three (origin, destination and origin-destination based) 
+        binning configuration for trips.
+
+        When the origin and destination points of trips are outside a threshold
+        limit, none of the trips should be binned with the other in any of the three 
+        configs (origin, destination or origin-and-destination based).       
         """
+
+        # generate $n trips.
+        n = 20   
+        binning_threshold=500
+        #this generates 20 trips one-by-one, where each trip's respective origin and destination 
+        # points are more than 500m away.
+
+
         label_data = {
             "mode_confirm": ['walk', 'bike', 'transit'],
             "purpose_confirm": ['work', 'home', 'school'],
             "replaced_mode": ['drive']
-        }
+        }         
+
+
+        trips =etmm.generate_mock_trips(
+                user_id="joe", 
+                trips=n, 
+                trip_part='__',
+                label_data=label_data, 
+                within_threshold=1, 
+                threshold=binning_threshold,
+                origin=(0,0),
+                destination=(1,1)
+            )
+
+        # parameters passed for testing. A list, where each element is one way of clustering
+        clustering_ways_paramters= ["origin","destination","origin-destination"]
+
+        #Testing each of the three clustering_ways by passing them as parameters
+        for cw in clustering_ways_paramters:
+            with self.subTest(clustering_way=cw):
+                #initialise the binning model and fit with previously generated trips
+                model_config = {
+                                    "metric": "od_similarity",
+                                    "similarity_threshold_meters": binning_threshold,  # meters,
+                                    "apply_cutoff": False,
+                                    "clustering_way": cw,  
+                                    "incremental_evaluation": False
+                                }
+                model= eamtg.GreedySimilarityBinning(model_config)
+                model.fit(trips)
+                #check each bins for no of trips
+                no_large_bin = all(map(lambda b: len(b['feature_rows']) == 1, model.bins.values()))
+                #Since all trips were sampled outside the threshold, there should be no bin
+                # with more than 1 trip
+                self.assertTrue(no_large_bin,"no bin should have more than 1 features in it")
 
-        # generate $n trips. $m of them should have origin and destinations sampled
+    def testBinning(self):
+        """
+        Tests the three (origin, destination and origin-destination based) 
+        binning configuration for trips.
+
+        When the points lie within the threshold, the trips are binned together.
+        """
+        # generate $n trips. $m of them should have origin sampled
         # within a radius that should have them binned.
         n = 20
         m = 5
-        trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=label_data, 
-            within_threshold=m, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-
-        # pass in a test configuration to the binning algorithm
-        model_config = {
-            "metric": "od_similarity",
-            "similarity_threshold_meters": 500,  # meters,
-            "apply_cutoff": False,
-            "incremental_evaluation": False
+        binning_threshold=500
+        label_data = {
+            "mode_confirm": ['walk', 'bike', 'transit'],
+            "purpose_confirm": ['work', 'home', 'school'],
+            "replaced_mode": ['drive']
         }
-        model = eamtg.GreedySimilarityBinning(model_config)
-
-        model.fit(trips)
 
-        # $m trip features should appear together in one bin
-        at_least_one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values()))
-        self.assertTrue(at_least_one_large_bin, "at least one bin should have at least 5 features in it")
+        # parameters passed for testing. A list, where each element of this list takes the form 
+        # [trip part to be sampled within mentioned threshold , clustering way used to check similarity]
+        parameters= [["o_",'origin'],["_d",'destination'],["od",'origin-destination']]
+        for tp,cw in parameters:
+            with self.subTest(trip_part=tp,clustering_way=cw):
+                #generate random trips using utilities
+                trips =etmm.generate_mock_trips(
+                    user_id="joe", 
+                    trips=n, 
+                    trip_part=tp,
+                    label_data=label_data, 
+                    within_threshold=m, 
+                    threshold=binning_threshold,
+                    origin=(0,0),
+                    destination=(1,1)
+                )
+                #initialise the binning model and fit with previously generated trips
+                model_config = {
+                            "metric": "od_similarity" ,
+                            "similarity_threshold_meters": binning_threshold,  # meters,
+                            "apply_cutoff": False,
+                            "clustering_way": cw,  
+                            "incremental_evaluation": False
+                 }
+                model = eamtg.GreedySimilarityBinning(model_config)
+                model.fit(trips)
+                #check each bins for no of trips
+                one_large_bin = any(map(lambda b: len(b['feature_rows']) == m, model.bins.values()))
+                #Since 5 trips were sampled within the threshold, there should be one bin with 5 trips
+                self.assertTrue(one_large_bin, "one bin should have 5 features in it")
 
     def testPrediction(self):
         """
@@ -60,23 +128,24 @@ def testPrediction(self):
         }
 
         n = 6
-        trips = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(0, 0), 
-            destination=(1, 1), 
-            label_data=label_data, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
-        )
-
+        trips =etmm.generate_mock_trips(
+                user_id="joe", 
+                trips=n, 
+                trip_part='od',
+                label_data=label_data, 
+                within_threshold=n, 
+                threshold=500,
+                origin=(0,0),
+                destination=(1,1)
+            )
         model_config = {
             "metric": "od_similarity",
-            "similarity_threshold_meters": 500,      # meters,
+            "similarity_threshold_meters": 500, # meters,
             "apply_cutoff": False,
+            "clustering_way": 'origin-destination',
             "incremental_evaluation": False
         }
-        model = eamtg.GreedySimilarityBinning(model_config)
-
+        model= eamtg.GreedySimilarityBinning(model_config)
         train = trips[0:5]
         test = trips[5]
 
@@ -95,33 +164,26 @@ def testNoPrediction(self):
             "purpose_confirm": ['pizza_party'],
             "replaced_mode": ['crabwalking']
         }
-
         n = 5
-        train = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=n, 
-            origin=(39.7645187, -104.9951944),       # Denver, CO
-            destination=(39.7435206, -105.2369292),  # Golden, CO
-            label_data=label_data, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        binning_threshold = 500
+        train = etmm.generate_mock_trips( user_id="joe",trips=n, origin=(39.7645187, -104.9951944), # Denver, CO
+                                   destination=(39.7435206, -105.2369292),  # Golden, CO
+                                   trip_part='od', label_data=label_data,
+                                   threshold=binning_threshold, within_threshold=n
         )
-        test = etmm.generate_mock_trips(
-            user_id="joe", 
-            trips=1, 
-            origin=(61.1042262, -150.5611644),       # Anchorage, AK
-            destination=(62.2721466, -150.3233046),  # Talkeetna, AK
-            label_data=label_data, 
-            threshold=0.001,  # ~ 111 meters in degrees WGS84
+        test = etmm.generate_mock_trips( user_id="amanda",trips=n, origin=(61.1042262, -150.5611644), # Anchorage, AK
+                                   destination=(62.2721466, -150.3233046),  # Talkeetna, AK
+                                   trip_part='od', label_data=label_data,                                   
+                                    threshold=binning_threshold, within_threshold=n
         )
-
         model_config = {
             "metric": "od_similarity",
-            "similarity_threshold_meters": 500,      # meters,
+            "similarity_threshold_meters": 500,  # meters,
             "apply_cutoff": False,
+            "clustering_way": 'origin-destination',  
             "incremental_evaluation": False
         }
-        model = eamtg.GreedySimilarityBinning(model_config)
-
+        model= eamtg.GreedySimilarityBinning(model_config)
         model.fit(train)
         results, n = model.predict(test[0])