Skip to content

Commit

Permalink
Integration Testing for forest model
Browse files Browse the repository at this point in the history
The changes in this iteration are improvements to the tests for the forest model:

1. Following last week's discussion, the regression test (`TestForestModel.py`) was removed, since it won't be useful once model performance improves. Instead, the structure of the predictions is checked. This check was merged into `TestForestModelIntegration.py`.

2. After #944, `predict_labels_with_n` in `run_model.py` expects a list and iterates over it. The forest model and the rest of the tests were updated accordingly.
  • Loading branch information
humbleOldSage committed Feb 5, 2024
1 parent 052cb08 commit 104dd9a
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 172 deletions.
10 changes: 4 additions & 6 deletions emission/analysis/modelling/trip_model/forest_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,11 @@ def predict(self, trip: List[float]) -> Tuple[List[Dict], int]:
#check if theres no trip to predict
logging.debug(f"forest classifier predict called with {len(trip)} trips")
if len(trip) == 0:
msg = f'model.predict cannot be called with an empty trips'
msg = f'model.predict cannot be called with an empty trip'
raise Exception(msg)
# CONVERT LIST OF TRIPS TO dataFrame
test_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",trip)
labeled_trip_df = esdtq.filter_labeled_trips(test_df)
expanded_labeled_trip_df= esdtq.expand_userinputs(labeled_trip_df)
predcitions_df= self.model.predict(expanded_labeled_trip_df)
# CONVERT TRIP TO dataFrame
test_df = estb.BuiltinTimeSeries.to_data_df("analysis/confirmed_trip",[trip])
predcitions_df= self.model.predict(test_df)

# the predictions_df currently holds the highest probable options
# individually in all three categories. the predictions_df are in the form
Expand Down
151 changes: 0 additions & 151 deletions emission/tests/modellingTests/TestForestModel.py

This file was deleted.

94 changes: 84 additions & 10 deletions emission/tests/modellingTests/TestForestModelIntegration.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,87 @@
import emission.pipeline.intake_stage as epi
import logging

class TestLabelInferencePipeline(unittest.TestCase):
# It is important that these functions be deterministic

import emission.analysis.modelling.trip_model.run_model as eamur
import emission.analysis.modelling.trip_model.model_type as eamumt
import emission.analysis.modelling.trip_model.model_storage as eamums
import emission.tests.modellingTests.modellingTestAssets as etmm
import emission.storage.timeseries.abstract_timeseries as esta


class TestForestModelIntegration(unittest.TestCase):
# Test if the forest model for label prediction is smoothly integrated with the inference pipeline.
# In the initial setup, build a dummy forest model. Then run the pipeline on real example data.
# Finally in the test, assert the type of label predictions expected.

def setUp(self):

self.reset_all()
np.random.seed(91)
self.test_algorithms = eacilp.primary_algorithms

forest_model_config= {
"loc_feature" : "coordinates",
"radius": 500,
"size_thresh":1,
"purity_thresh":1.0,
"gamma":0.05,
"C":1,
"n_estimators":100,
"criterion":"gini",
"max_depth":'null',
"min_samples_split":2,
"min_samples_leaf":1,
"max_features":"sqrt",
"bootstrap":True,
"random_state":42,
"use_start_clusters":False,
"use_trip_clusters":True
}
etc.setupRealExample(self, "emission/tests/data/real_examples/shankari_2015-07-22") ##maybe use a different file
ts = esta.TimeSeries.get_time_series(self.testUUID)
label_data = {
"mode_confirm": ['ebike', 'bike'],
"purpose_confirm": ['happy-hour', 'dog-park'],
"replaced_mode": ['walk'],
"mode_weights": [0.9, 0.1],
"purpose_weights": [0.1, 0.9]
}

self.origin = (-105.1705977, 39.7402654,)
self.destination = (-105.1755606, 39.7673075)
self.min_trips = 14
self.total_trips = 100
self.clustered_trips = 33
self.has_label_percent = 0.9
## generate mock trips
train = etmm.generate_mock_trips(
user_id=self.testUUID,
trips=self.total_trips,
origin=self.origin,
destination=self.destination,
trip_part='od',
label_data=label_data,
within_threshold=self.clustered_trips,
threshold=0.004, # ~400m
has_label_p=self.has_label_percent
)
ts.bulk_insert(train)
# confirm data write did not fail
check_data = esda.get_entries(key="analysis/confirmed_trip", user_id=self.testUUID, time_query=None)
if len(check_data) != self.total_trips:
logging.debug(f'test invariant failed after generating test data')
self.fail()
else:
logging.debug(f'found {self.total_trips} trips in database')
## Build an already existing model or a new model
eamur.update_trip_model(
user_id=self.testUUID,
model_type=eamumt.ModelType.RANDOM_FOREST_CLASSIFIER,
model_storage=eamums.ModelStorage.DOCUMENT_DATABASE,
min_trips=4,
model_config=forest_model_config
)
## run inference pipeline
self.run_pipeline(self.test_algorithms)
time_range = estt.TimeQuery("metadata.write_ts", None, time.time())
self.inferred_trips = esda.get_entries(esda.INFERRED_TRIP_KEY, self.testUUID, time_query=time_range)
Expand All @@ -39,16 +110,19 @@ def run_pipeline(self, algorithms):
def reset_all(self):
etc.dropAllCollections(edb._get_current_db())

# Tests that algorithm being tested runs and saves to the database correctly
def testIndividualAlgorithms(self):
logging.debug('TEST1')
# Tests that forest algorithm being tested runs successfully
def testForestAlgorithm(self):
for trip in self.inferred_trips:
entries = esdt.get_sections_for_trip("inference/labels", self.testUUID, trip.get_id())
logging.debug(f"ENTRIES: {entries}")
self.assertEqual(len(entries), len(self.test_algorithms))
# for entry in entries:
# self.assertGreater(len(entry["data"]["prediction"]), 0)

for entry in entries:
self.assertGreater(len(entry["data"]["prediction"]), 0)
for singleprediction in entry["data"]["prediction"]:
self.assertIsInstance(singleprediction, dict, " should be an instance of the dictionary class")
self.assertIsInstance(singleprediction['labels'], dict, " should be an instance of the dictionary class")
self.assertIn('mode_confirm',singleprediction['labels'].keys())
self.assertIn('replaced_mode',singleprediction['labels'].keys())
self.assertIn('purpose_confirm',singleprediction['labels'].keys())

def main():
etc.configLogging()
Expand Down
8 changes: 4 additions & 4 deletions emission/tests/modellingTests/TestForestModelLoadandSave.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def testForestModelRoundTrip(self):
# logging.debug(f'Predictions on trips in database')

predictions_list = eamur.predict_labels_with_n(
trip_list = [test],
trip_list = test,
model=model
)

Expand All @@ -151,7 +151,7 @@ def testForestModelRoundTrip(self):

# logging.debug(f'Predictions on trips using deserialised model')
predictions_loaded_model_list = eamur.predict_labels_with_n(
trip_list = [test],
trip_list = test,
model=deserialized_model
)
# logging.debug(f'Assert that both predictions are the same')
Expand Down Expand Up @@ -184,7 +184,7 @@ def testForestModelConsistency(self):
# logging.debug(f' Model Predictions on trips in database')

predictions_list_model1 = eamur.predict_labels_with_n(
trip_list = [test],
trip_list = test,
model=model_iter1
)
# logging.debug(f' Loading Model again')
Expand All @@ -197,7 +197,7 @@ def testForestModelConsistency(self):
)
# logging.debug(f' Model Predictions on trips in database')
predictions_list_model2 = eamur.predict_labels_with_n(
trip_list = [test],
trip_list = test,
model=model_iter2
)

Expand Down
9 changes: 8 additions & 1 deletion emission/tests/modellingTests/TestRunForestModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,16 @@ def test1RoundPredictForestModel(self):
)

predictions_list = eamur.predict_labels_with_n(
trip_list = [test],
trip_list = test,
model=model
)
for prediction, n in predictions_list:
[logging.debug(p) for p in sorted(prediction, key=lambda r: r['p'], reverse=True)]
self.assertNotEqual(len(prediction), 0, "should have a prediction")
self.assertIn('labels',prediction[0].keys())
self.assertIn('p',prediction[0].keys())
self.assertIsInstance(prediction[0], dict, " should be an instance of the dictionary class")
self.assertIsInstance(prediction[0]['labels'], dict, " should be an instance of the dictionary class")
self.assertIn('mode_confirm',prediction[0]['labels'].keys())
self.assertIn('replaced_mode',prediction[0]['labels'].keys())
self.assertIn('purpose_confirm',prediction[0]['labels'].keys())

0 comments on commit 104dd9a

Please sign in to comment.