Simplified count computation and updated returned values

- Reused the existing `_get_entries_for_timeseries` functionality in builtin_timeseries.py to fetch the computed count of entries.
- Corrected the return value: `find_entries_count` now returns a single combined count of matching entries, no longer segregated by key or by timeseries dataset.
e-mission · Sep 8, 2023 · 825d4ce · 825d4ce
1 parent 0903cf7
commit 825d4ce
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 54 deletions.
diff --git a/emission/storage/timeseries/builtin_timeseries.py b/emission/storage/timeseries/builtin_timeseries.py
@@ -446,8 +446,8 @@ def find_entries_count(self, key_list = None, time_query = None, geo_query = Non
 
         Input: Key list with keys from both timeseries DBs = [key1, key2, key3, key4, ...]
                 Suppose (key1, key2) are orig_tsdb keys and (key3, key4) are analysis_tsdb keys
-        Output: Tuple of lists  = (orig_tsdb_count, analysis_tsdb_count)
-                                = ([count_key1, count_key2, ...], [count_key3, count_key4, ...])
+        Output: total_count = orig_tsdb_count + analysis_tsdb_count
+                            
                 Orig_tsdb_count and Analysis_tsdb_count are lists containing counts of matching documents 
                 for each key considered separately for the specific timeseries DB.
 
@@ -456,47 +456,23 @@ def find_entries_count(self, key_list = None, time_query = None, geo_query = Non
         :param geo_query: the query for a geographical area
         :param extra_query_list: any additional queries to filter out data
 
-        For key_list = None, total count of all documents are returned for each of the matching timeseries DBs.
+        For key_list = None or empty, total count of all documents are returned considering the matching entries from entire dataset.
         """
         logging.debug("builtin_timeseries.find_entries_count() called")
 
         orig_tsdb = self.timeseries_db
         analysis_tsdb = self.analysis_timeseries_db
-
-        orig_tsdb_counts = []
-        analysis_tsdb_counts = []
 
-        if key_list == [] or key_list is None:
+        if key_list == []:
             key_list = None
 
-        # Segregate orig_tsdb and analysis_tsdb keys
+        # Segregate orig_tsdb and analysis_tsdb keys so as to fetch counts on each dataset
         (orig_tsdb_keys, analysis_tsdb_keys) = self._split_key_list(key_list)
 
-        orig_tsdb_counts = self._get_entries_counts_for_timeseries(orig_tsdb, orig_tsdb_keys, time_query, geo_query, extra_query_list)
-        analysis_tsdb_counts = self._get_entries_counts_for_timeseries(analysis_tsdb, analysis_tsdb_keys, time_query, geo_query, extra_query_list)
-
-        return (orig_tsdb_counts, analysis_tsdb_counts)
-
-
-    def _get_entries_counts_for_timeseries(self, tsdb, key_list, time_query, geo_query, extra_query_list):
-
-        tsdb_queries = []
-        tsdb_counts = []
-
-        # For each key in orig_tsdb keys, create a query
-        if key_list is not None:
-            for key in key_list:
-                tsdb_query = self._get_query([key], time_query, geo_query, extra_query_list)
-                tsdb_queries.append(tsdb_query)
-            # For each query generated for each orig_tsdb key, fetch count of matching documents
-            for query in tsdb_queries:
-                entries_count = tsdb.count_documents(query)
-                tsdb_counts.append(entries_count)
-        else:
-            tsdb_queries = self._get_query(key_list, time_query, geo_query, extra_query_list)
-            entries_count = tsdb.count_documents(tsdb_queries)
-            tsdb_counts = [entries_count]
+        orig_tsdb_count = self._get_entries_for_timeseries(orig_tsdb, orig_tsdb_keys, time_query, geo_query, extra_query_list, None)[0]
+        analysis_tsdb_count = self._get_entries_for_timeseries(analysis_tsdb, analysis_tsdb_keys, time_query, geo_query, extra_query_list, None)[0]
 
-        return tsdb_counts      
+        total_matching_count = orig_tsdb_count + analysis_tsdb_count
+        return total_matching_count
 
 
diff --git a/emission/tests/storageTests/TestTimeSeries.py b/emission/tests/storageTests/TestTimeSeries.py
@@ -84,35 +84,40 @@ def testExtraQueries(self):
     def testFindEntriesCount(self):
         '''
         Test: Specific keys with other parameters not passed values.
-        Input: A set of keys from either of the two timeseries databases.
-        Output: A tuple of two lists (one for each timeseries database). Length of list depends on number of keys for that specific timeseries database.
 
-        Input: For each dataset: ["background/location", "background/filtered_location", "analysis/confirmed_trip"]
+        Input: A list of keys from either of the timeseries databases.
+            - For each dataset: ["background/location", "background/filtered_location", "analysis/confirmed_trip"]
             - Testing this with sample dataset: "shankari_2015-aug-21", "shankari_2015-aug-27"
-        Output: Aug_21: ([738, 508], [0]), Aug_27: ([555, 327], [0])
-            - Actual output just returns a single number for count of entries.
+            
+        Outputs: Single number representing total count of matching entries.
+            - For builtin_timeseries: Returns total count of all entries matching the userid. 
+            - For aggregate_timeseries: Returns total count of all entries matching all users.
+
             - Validated using grep count of occurrences for keys: 1) "background/location"     2) "background/filtered_location"    3) "analysis/confirmed_trip"
                 - Syntax: $ grep -c <key> <dataset>.json
                 - Sample: $ grep -c "background/location" emission/tests/data/real_examples/shankari_2015-aug-21
 
             - Grep Output Counts For Aug-21 dataset for each key:
                 1) background/location = 738,    2) background/filtered_location = 508,   3) analysis/confirmed_trip = 0
+                Hence total count = 738 + 508 + 0 = 1246
 
             - Grep Output Counts For Aug-27 dataset for each key:
                 1) background/location = 555,    2) background/filtered_location = 327,   3) analysis/confirmed_trip = 0
+                Hence total count = 555 + 327 + 0 = 882
         
         For Aggregate Timeseries test case:
-        - The expected output would be summed-up values for the respective keys from the individual users testing outputs mentioned above.
-        - Output: ([1293, 835], [0])
+
+        - Input: []
+        - Output: 3607
+            - 3607 = 2125 (UUID1) + 1482 (UUID2)
+
+        - Input: ["background/location", "background/filtered_location", "analysis/confirmed_trip"]
+        - Output: 2128
             - For each of the 3 input keys from key_list1: 
                 - 1293 = 738 (UUID1) + 555 (UUID2)
                 - 835 = 508 (UUID1) + 327 (UUID2)
                 - 0 = 0 (UUID1) + 0 (UUID2)
-
-        Empty/Blank keys
-        - Empty array is returned in case there were no keys pertaining to the respective timeseries database.
-        - This is to differentiate from the [0] case where a key might be present in the input but no matching documents found.
-        - Whereas in this case of [], no key was present in the input itself.
+            - Hence total count = 1293 + 835 + 0 = 2128
 
         '''
 
@@ -122,46 +127,51 @@ def testFindEntriesCount(self):
         # Test case: Combination of original and analysis timeseries DB keys for Aug-21 dataset
         key_list1=["background/location", "background/filtered_location", "analysis/confirmed_trip"]
         count_ts1 = ts1_aug_21.find_entries_count(key_list=key_list1)
-        self.assertEqual(count_ts1, ([738, 508], [0]))
+        self.assertEqual(count_ts1, 1246)
 
         # Test case: Combination of original and analysis timeseries DB keys for Aug-27 dataset
         key_list1=["background/location", "background/filtered_location", "analysis/confirmed_trip"]
         count_ts2 = ts2_aug_27.find_entries_count(key_list=key_list1)
-        self.assertEqual(count_ts2, ([555, 327], [0]))
+        self.assertEqual(count_ts2, 882)
 
         # Test case: Only original timeseries DB keys for Aug-27 dataset
         key_list2=["background/location", "background/filtered_location"]
         count_ts3 = ts2_aug_27.find_entries_count(key_list=key_list2)
-        self.assertEqual(count_ts3, ([555, 327], []))
+        self.assertEqual(count_ts3, 882)
 
         # Test case: Only analysis timeseries DB keys
         key_list3=["analysis/confirmed_trip"]
         count_ts4 = ts2_aug_27.find_entries_count(key_list=key_list3)
-        self.assertEqual(count_ts4, ([], [0]))
+        self.assertEqual(count_ts4, 0)
 
         # Test case: Empty key_list which should return total count of all documents in the two DBs
         key_list4=[]
         count_ts5 = ts1_aug_21.find_entries_count(key_list=key_list4)
-        self.assertEqual(count_ts5, ([2125], [0]))
+        self.assertEqual(count_ts5, 2125)
 
         # Test case: Invalid or unmatched key in metadata field 
         key_list5=["randomxyz_123test"]
         with self.assertRaises(KeyError) as ke:
             count_ts6 = ts1_aug_21.find_entries_count(key_list=key_list5)
         self.assertEqual(str(ke.exception), "'randomxyz_123test'")
 
-        # Test case: Aggregate timeseries DB User data passed as input
+        # Test case: Aggregate timeseries DB User data passed as input with non-empty key_list
         ts_agg = esta.TimeSeries.get_aggregate_time_series()
         count_ts7 = ts_agg.find_entries_count(key_list=key_list1)
-        self.assertEqual(count_ts7, ([1293, 835], [0]))
+        self.assertEqual(count_ts7, 2128)
+
+        # Test case: Aggregate timeseries DB User data passed as input with empty key_list
+        ts_agg = esta.TimeSeries.get_aggregate_time_series()
+        count_ts8 = ts_agg.find_entries_count(key_list=key_list4)
+        self.assertEqual(count_ts8, 3607)
 
         # Test case: New User created with no data to check
         self.testEmail = None
         self.testUUID2 = self.testUUID
         etc.createAndFillUUID(self)
         ts_new_user = esta.TimeSeries.get_time_series(self.testUUID)
-        count_ts8 = ts_new_user.find_entries_count(key_list=key_list1)
-        self.assertEqual(count_ts8, ([0, 0], [0]))
+        count_ts9 = ts_new_user.find_entries_count(key_list=key_list1)
+        self.assertEqual(count_ts9, 0)
 
         print("Assert Test for Count Data successful!")