Skip to content

Commit

Permalink
Simplified count computation and updated returned values
Browse files Browse the repository at this point in the history
- Reused existing functionality from builtin_timeseries.py to fetch computed count of entries
- Changed the return value to a single combined count of matching entries, no longer segregated by key or by timeseries database.
  • Loading branch information
Mahadik, Mukul Chandrakant authored and Mahadik, Mukul Chandrakant committed Sep 8, 2023
1 parent 0903cf7 commit 825d4ce
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 54 deletions.
42 changes: 9 additions & 33 deletions emission/storage/timeseries/builtin_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,8 +446,8 @@ def find_entries_count(self, key_list = None, time_query = None, geo_query = Non
Input: Key list with keys from both timeseries DBs = [key1, key2, key3, key4, ...]
Suppose (key1, key2) are orig_tsdb keys and (key3, key4) are analysis_tsdb keys
Output: Tuple of lists = (orig_tsdb_count, analysis_tsdb_count)
= ([count_key1, count_key2, ...], [count_key3, count_key4, ...])
Output: total_count = orig_tsdb_count + analysis_tsdb_count
orig_tsdb_count and analysis_tsdb_count are the counts of matching documents
in the respective timeseries DB, taken over all the keys that belong to that DB.
Expand All @@ -456,47 +456,23 @@ def find_entries_count(self, key_list = None, time_query = None, geo_query = Non
:param geo_query: the query for a geographical area
:param extra_query_list: any additional queries to filter out data
For key_list = None, total count of all documents are returned for each of the matching timeseries DBs.
For key_list = None or empty, the total count of all matching documents across the entire dataset is returned.
"""
logging.debug("builtin_timeseries.find_entries_count() called")

orig_tsdb = self.timeseries_db
analysis_tsdb = self.analysis_timeseries_db

orig_tsdb_counts = []
analysis_tsdb_counts = []

if key_list == [] or key_list is None:
if key_list == []:
key_list = None

# Segregate orig_tsdb and analysis_tsdb keys
# Segregate orig_tsdb and analysis_tsdb keys so as to fetch counts on each dataset
(orig_tsdb_keys, analysis_tsdb_keys) = self._split_key_list(key_list)

orig_tsdb_counts = self._get_entries_counts_for_timeseries(orig_tsdb, orig_tsdb_keys, time_query, geo_query, extra_query_list)
analysis_tsdb_counts = self._get_entries_counts_for_timeseries(analysis_tsdb, analysis_tsdb_keys, time_query, geo_query, extra_query_list)

return (orig_tsdb_counts, analysis_tsdb_counts)


def _get_entries_counts_for_timeseries(self, tsdb, key_list, time_query, geo_query, extra_query_list):

tsdb_queries = []
tsdb_counts = []

# For each key in orig_tsdb keys, create a query
if key_list is not None:
for key in key_list:
tsdb_query = self._get_query([key], time_query, geo_query, extra_query_list)
tsdb_queries.append(tsdb_query)
# For each query generated for each orig_tsdb key, fetch count of matching documents
for query in tsdb_queries:
entries_count = tsdb.count_documents(query)
tsdb_counts.append(entries_count)
else:
tsdb_queries = self._get_query(key_list, time_query, geo_query, extra_query_list)
entries_count = tsdb.count_documents(tsdb_queries)
tsdb_counts = [entries_count]
orig_tsdb_count = self._get_entries_for_timeseries(orig_tsdb, orig_tsdb_keys, time_query, geo_query, extra_query_list, None)[0]
analysis_tsdb_count = self._get_entries_for_timeseries(analysis_tsdb, analysis_tsdb_keys, time_query, geo_query, extra_query_list, None)[0]

return tsdb_counts
total_matching_count = orig_tsdb_count + analysis_tsdb_count
return total_matching_count


52 changes: 31 additions & 21 deletions emission/tests/storageTests/TestTimeSeries.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,35 +84,40 @@ def testExtraQueries(self):
def testFindEntriesCount(self):
'''
Test: Specific keys are passed; all other parameters are left at their default values.
Input: A set of keys from either of the two timeseries databases.
Output: A tuple of two lists (one for each timeseries database). Length of list depends on number of keys for that specific timeseries database.
Input: For each dataset: ["background/location", "background/filtered_location", "analysis/confirmed_trip"]
Input: A list of keys from either of the timeseries databases.
- For each dataset: ["background/location", "background/filtered_location", "analysis/confirmed_trip"]
- Testing this with sample dataset: "shankari_2015-aug-21", "shankari_2015-aug-27"
Output: Aug_21: ([738, 508], [0]), Aug_27: ([555, 327], [0])
- Actual output just returns a single number for count of entries.
Outputs: Single number representing total count of matching entries.
- For builtin_timeseries: Returns total count of all entries matching the userid.
- For aggregate_timeseries: Returns total count of all entries matching all users.
- Validated using grep count of occurrences for keys: 1) "background/location" 2) "background/filtered_location" 3) "analysis/confirmed_trip"
- Syntax: $ grep -c <key> <dataset>.json
- Sample: $ grep -c "background/location" emission/tests/data/real_examples/shankari_2015-aug-21
- Grep Output Counts For Aug-21 dataset for each key:
1) background/location = 738, 2) background/filtered_location = 508, 3) analysis/confirmed_trip = 0
Hence total count = 738 + 508 + 0 = 1246
- Grep Output Counts For Aug-27 dataset for each key:
1) background/location = 555, 2) background/filtered_location = 327, 3) analysis/confirmed_trip = 0
Hence total count = 555 + 327 + 0 = 882
For Aggregate Timeseries test case:
- The expected output would be summed-up values for the respective keys from the individual users testing outputs mentioned above.
- Output: ([1293, 835], [0])
- Input: []
- Output: 3607
- 3607 = 2125 (UUID1) + 1482 (UUID2)
- Input: ["background/location", "background/filtered_location", "analysis/confirmed_trip"]
- Output: 2128
- For each of the 3 input keys from key_list1:
- 1293 = 738 (UUID1) + 555 (UUID2)
- 835 = 508 (UUID1) + 327 (UUID2)
- 0 = 0 (UUID1) + 0 (UUID2)
Empty/Blank keys
- Empty array is returned in case there were no keys pertaining to the respective timeseries database.
- This is to differentiate from the [0] case where a key might be present in the input but no matching documents found.
- Whereas in this case of [], no key was present in the input itself.
- Hence total count = 1293 + 835 + 0 = 2128
'''

Expand All @@ -122,46 +127,51 @@ def testFindEntriesCount(self):
# Test case: Combination of original and analysis timeseries DB keys for Aug-21 dataset
key_list1=["background/location", "background/filtered_location", "analysis/confirmed_trip"]
count_ts1 = ts1_aug_21.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts1, ([738, 508], [0]))
self.assertEqual(count_ts1, 1246)

# Test case: Combination of original and analysis timeseries DB keys for Aug-27 dataset
key_list1=["background/location", "background/filtered_location", "analysis/confirmed_trip"]
count_ts2 = ts2_aug_27.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts2, ([555, 327], [0]))
self.assertEqual(count_ts2, 882)

# Test case: Only original timeseries DB keys for Aug-27 dataset
key_list2=["background/location", "background/filtered_location"]
count_ts3 = ts2_aug_27.find_entries_count(key_list=key_list2)
self.assertEqual(count_ts3, ([555, 327], []))
self.assertEqual(count_ts3, 882)

# Test case: Only analysis timeseries DB keys
key_list3=["analysis/confirmed_trip"]
count_ts4 = ts2_aug_27.find_entries_count(key_list=key_list3)
self.assertEqual(count_ts4, ([], [0]))
self.assertEqual(count_ts4, 0)

# Test case: Empty key_list which should return total count of all documents in the two DBs
key_list4=[]
count_ts5 = ts1_aug_21.find_entries_count(key_list=key_list4)
self.assertEqual(count_ts5, ([2125], [0]))
self.assertEqual(count_ts5, 2125)

# Test case: Invalid or unmatched key in metadata field
key_list5=["randomxyz_123test"]
with self.assertRaises(KeyError) as ke:
count_ts6 = ts1_aug_21.find_entries_count(key_list=key_list5)
self.assertEqual(str(ke.exception), "'randomxyz_123test'")

# Test case: Aggregate timeseries DB User data passed as input
# Test case: Aggregate timeseries DB User data passed as input with non-empty key_list
ts_agg = esta.TimeSeries.get_aggregate_time_series()
count_ts7 = ts_agg.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts7, ([1293, 835], [0]))
self.assertEqual(count_ts7, 2128)

# Test case: Aggregate timeseries DB User data passed as input with empty key_list
ts_agg = esta.TimeSeries.get_aggregate_time_series()
count_ts8 = ts_agg.find_entries_count(key_list=key_list4)
self.assertEqual(count_ts8, 3607)

# Test case: New User created with no data to check
self.testEmail = None
self.testUUID2 = self.testUUID
etc.createAndFillUUID(self)
ts_new_user = esta.TimeSeries.get_time_series(self.testUUID)
count_ts8 = ts_new_user.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts8, ([0, 0], [0]))
count_ts9 = ts_new_user.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts9, 0)

print("Assert Test for Count Data successful!")

Expand Down

0 comments on commit 825d4ce

Please sign in to comment.