Improved test cases for edge cases + refactored logic
Code logic:
- Updated the code to return counts from both original_timeseries and analysis_timeseries as a tuple of lists, with one count per key for each timeseries DB (see the usage sketch after the list below).

Test cases:
- Updated test cases to include counts from both DBs.
- Included new test cases for edge conditions.
- Test cases include:
1. key_list with keys from both DBs.
2. key_list with keys from the original timeseries DB only.
3. key_list with keys from the analysis timeseries DB only.
4. Empty key_list.
5. Invalid keys.
6. Aggregate timeseries DB data as input.
7. New user without any data.
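As a quick illustration of the new return contract (a minimal sketch; the esta import path and the user_id value are assumptions modeled on the tests in this commit):

import emission.storage.timeseries.abstract_timeseries as esta

# Assumed: user_id is the UUID of an existing user with data in both DBs.
ts = esta.TimeSeries.get_time_series(user_id)
key_list = ["background/location", "background/filtered_location", "analysis/confirmed_trip"]

# One count per key, grouped by the DB each key lives in:
# ([count of "background/location", count of "background/filtered_location"],
#  [count of "analysis/confirmed_trip"])
(orig_counts, analysis_counts) = ts.find_entries_count(key_list=key_list)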
Mahadik, Mukul Chandrakant authored and Mahadik, Mukul Chandrakant committed Sep 5, 2023
1 parent bb88af4 commit 821478c
Showing 2 changed files with 114 additions and 17 deletions.
59 changes: 52 additions & 7 deletions emission/storage/timeseries/builtin_timeseries.py
@@ -440,18 +440,63 @@ def update_data(user_id, key, obj_id, data):
logging.debug("updating entry %s into timeseries" % new_entry)
edb.save(ts.get_timeseries_db(key), new_entry)

def find_entries_count(self, key, time_query = None, geo_query = None, extra_query_list = None):
def find_entries_count(self, key_list = None, time_query = None, geo_query = None, extra_query_list = None):
"""
Returns the total number of documents for the specific key referring to a timeseries db.
:param key: the metadata key we are querying for. Only supports one key for now.
Returns the total number of matching documents in each of the two timeseries DBs for the given key_list.
Input: key_list with keys from both timeseries DBs = [key1, key2, key3, key4, ...]
Suppose (key1, key2) are orig_tsdb keys and (key3, key4) are analysis_tsdb keys.
Output: tuple of lists = (orig_tsdb_counts, analysis_tsdb_counts)
                       = ([count_key1, count_key2, ...], [count_key3, count_key4, ...])
orig_tsdb_counts and analysis_tsdb_counts are lists containing the count of matching documents
for each key, computed separately against that key's timeseries DB.
:param key_list: list of metadata keys we are querying for.
:param time_query: the time range in which to search the stream
:param geo_query: the query for a geographical area
:param extra_query_list: any additional queries to filter out data
For key_list = None, the total count of all documents is returned for each timeseries DB.
"""
logging.debug("builtin_timeseries.find_entries_count() called")
created_query = self._get_query([key], time_query, geo_query, extra_query_list)
result_dataset = self.get_timeseries_db(key)
total_entries = result_dataset.count_documents(created_query)
return total_entries

orig_tsdb = self.timeseries_db
analysis_tsdb = self.analysis_timeseries_db

orig_tsdb_counts = []
analysis_tsdb_counts = []

if key_list == [] or key_list is None:
key_list = None

# Segregate orig_tsdb and analysis_tsdb keys
(orig_tsdb_keys, analysis_tsdb_keys) = self._split_key_list(key_list)

orig_tsdb_counts = self._get_entries_counts_for_timeseries(orig_tsdb, orig_tsdb_keys, time_query, geo_query, extra_query_list)
analysis_tsdb_counts = self._get_entries_counts_for_timeseries(analysis_tsdb, analysis_tsdb_keys, time_query, geo_query, extra_query_list)

return (orig_tsdb_counts, analysis_tsdb_counts)


def _get_entries_counts_for_timeseries(self, tsdb, key_list, time_query, geo_query, extra_query_list):

tsdb_queries = []
tsdb_counts = []

# For each key in key_list, build a separate query
if key_list is not None:
for key in key_list:
tsdb_query = self._get_query([key], time_query, geo_query, extra_query_list)
tsdb_queries.append(tsdb_query)
# For each generated query, fetch the count of matching documents
for query in tsdb_queries:
entries_count = tsdb.count_documents(query)
tsdb_counts.append(entries_count)
else:
# key_list is None: build a single query that matches all documents in this DB
tsdb_query = self._get_query(key_list, time_query, geo_query, extra_query_list)
entries_count = tsdb.count_documents(tsdb_query)
tsdb_counts = [entries_count]

return tsdb_counts
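The new code calls self._split_key_list(), which is not part of this diff. Purely as an assumption about its behavior (not the committed helper), it could be sketched as routing each key to whichever collection get_timeseries_db() resolves it to:

def _split_key_list(self, key_list):
    # Hypothetical sketch: separate keys by the collection they map to.
    if key_list is None:
        return (None, None)
    orig_tsdb_keys = []
    analysis_tsdb_keys = []
    for key in key_list:
        # An unknown key would raise KeyError here, matching the invalid-key test below.
        if self.get_timeseries_db(key) == self.analysis_timeseries_db:
            analysis_tsdb_keys.append(key)
        else:
            orig_tsdb_keys.append(key)
    return (orig_tsdb_keys, analysis_tsdb_keys)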


72 changes: 62 additions & 10 deletions emission/tests/storageTests/TestTimeSeries.py
@@ -84,24 +84,76 @@ def testExtraQueries(self):
def testFindEntriesCount(self):
'''
Test: specific keys, with the other parameters left at their default values.
Input: For each dataset: ["background/location", "background/filtered_location]
Input: For each dataset: ["background/location", "background/filtered_location", "analysis/confirmed_trip"]
- Testing this with sample dataset: "shankari_2015-aug-21", "shankari_2015-aug-27"
Output: Aug_21: [738, 508], Aug_27: [555, 327]
Output: Aug_21: ([738, 508], [0]), Aug_27: ([555, 327], [0])
- Each returned count is a single integer giving the number of matching entries for that key.
- Validated using grep count of occurrences for keys: 1) "background/location" 2) "background/filtered_location"
- $ grep -c <key> <dataset>.json
For the aggregate timeseries test case:
- UUID('e66d0a3a-4316-4d9d-ac66-ee3754081d09') is returned as the only distinct user; it is stored in MongoDB as a BinData datatype.
- Validated the count of documents for the keys via MongoDB shell access in the terminal.
- Ran these queries in the mongo shell to get the counts (a pymongo sketch appears after the test code below):
$ db.Stage_timeseries.find({$and: [{"user_id" : BinData(3,"5m0KOkMWTZ2sZu43VAgdCQ==")}, {"metadata.key" : "background/location"}]}).count()
$ db.Stage_timeseries.find({$and: [{"user_id" : BinData(3,"5m0KOkMWTZ2sZu43VAgdCQ==")}, {"metadata.key" : "background/filtered_location"}]}).count()
$ db.Stage_analysis_timeseries.find({$and: [{"user_id" : BinData(3,"5m0KOkMWTZ2sZu43VAgdCQ==")}, {"metadata.key" : "analysis/confirmed_trip"}]}).count()
- The counts returned were 1476, 1016, 5, respectively.
'''

ts1_aug_21 = esta.TimeSeries.get_time_series(self.testUUID1)
ts2_aug_27 = esta.TimeSeries.get_time_series(self.testUUID)

count_ts1 = [ts1_aug_21.find_entries_count(key="background/location"), ts1_aug_21.find_entries_count(key="background/filtered_location")]
print("\nEntry counts for location, filtered_location on {} = {}".format("Aug_21", count_ts1))
self.assertEqual(count_ts1, [738, 508])

count_ts2 = [ts2_aug_27.find_entries_count(key="background/location"), ts2_aug_27.find_entries_count(key="background/filtered_location")]
print("Entry counts for location, filtered_location on {} = {}".format("Aug_27", count_ts2))
self.assertEqual(count_ts2, [555, 327])

# Test case: Combination of original and analysis timeseries DB keys for Aug-21 dataset
key_list1=["background/location", "background/filtered_location", "analysis/confirmed_trip"]
count_ts1 = ts1_aug_21.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts1, ([738, 508], [0]))

# Test case: Combination of original and analysis timeseries DB keys for Aug-27 dataset
key_list1=["background/location", "background/filtered_location", "analysis/confirmed_trip"]
count_ts2 = ts2_aug_27.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts2, ([555, 327], [0]))

# Test case: Only original timeseries DB keys for Aug-27 dataset
key_list2=["background/location", "background/filtered_location"]
count_ts3 = ts2_aug_27.find_entries_count(key_list=key_list2)
self.assertEqual(count_ts3, ([555, 327], []))

# Test case: Only analysis timeseries DB keys
key_list3=["analysis/confirmed_trip"]
count_ts4 = ts2_aug_27.find_entries_count(key_list=key_list3)
self.assertEqual(count_ts4, ([], [0]))

# Test case: Empty key_list which should return total count of all documents in the two DBs
key_list4=[]
count_ts5 = ts1_aug_21.find_entries_count(key_list=key_list4)
self.assertEqual(count_ts5, ([2125], [0]))

# Test case: Invalid or unmatched key in metadata field
key_list5=["randomxyz_123test"]
with self.assertRaises(KeyError) as ke:
count_ts6 = ts1_aug_21.find_entries_count(key_list=key_list5)
self.assertEqual(str(ke.exception), "'randomxyz_123test'")

# Test case: Aggregate timeseries DB User data passed as input
ts_agg = esta.TimeSeries.get_aggregate_time_series()
users_distinct = ts_agg.get_distinct_users()
for uuid in users_distinct:
ts_user_ag = esta.TimeSeries.get_time_series(uuid)
count_ts7 = ts_user_ag.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts7, ([1476, 1016], [5]))

# Test case: New User created with no data to check
self.testEmail = None
self.testUUID2 = self.testUUID
etc.createAndFillUUID(self)
ts_new_user = esta.TimeSeries.get_time_series(self.testUUID)
count_ts8 = ts_new_user.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts8, ([0, 0], [0]))
self.testUUID = self.testUUID2
self.testEmail = "user2"

print("Assert Test for Count Data successful!")
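For reference, the mongo shell validation queries quoted in the docstring above could be reproduced with pymongo roughly as follows. This is only a sketch: the database name, connection settings, and UUID representation are assumptions, while the collection names, keys, and expected counts come from the docstring.

from uuid import UUID
import pymongo

# Assumed local connection; legacy BinData(3, ...) user_ids generally need the
# python-legacy UUID representation to match.
client = pymongo.MongoClient(uuidRepresentation="pythonLegacy")
db = client["Stage_database"]  # assumed database name

test_uuid = UUID("e66d0a3a-4316-4d9d-ac66-ee3754081d09")
loc_count = db["Stage_timeseries"].count_documents(
    {"user_id": test_uuid, "metadata.key": "background/location"})
filtered_count = db["Stage_timeseries"].count_documents(
    {"user_id": test_uuid, "metadata.key": "background/filtered_location"})
trip_count = db["Stage_analysis_timeseries"].count_documents(
    {"user_id": test_uuid, "metadata.key": "analysis/confirmed_trip"})
print(loc_count, filtered_count, trip_count)  # expected per the docstring: 1476 1016 5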

