Improved test cases for edge cases + refactored logic
Code logic:
- Updated the code to return counts from both original_timeseries and analysis_timeseries as a tuple of lists, with one count per key for each timeseries DB (see the usage sketch after the list below).

Test cases:
- Updated test cases to include counts from both DBs.
- Included new test cases for edge conditions.
- Test cases include:
1. key_list with keys from both DBs.
2. key_list with keys from the original timeseries DB only.
3. key_list with keys from the analysis timeseries DB only.
4. Empty key_list.
5. Invalid keys.
6. Aggregate timeseries DB data as input.
7. New user without any data.
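As a quick illustration of the new return contract (a minimal sketch; the esta import path and the user_id value are assumptions modeled on the tests in this commit):

import emission.storage.timeseries.abstract_timeseries as esta

# Assumed: user_id is the UUID of an existing user with data in both DBs.
ts = esta.TimeSeries.get_time_series(user_id)
key_list = ["background/location", "background/filtered_location", "analysis/confirmed_trip"]

# One count per key, grouped by the DB each key lives in:
# ([count of "background/location", count of "background/filtered_location"],
#  [count of "analysis/confirmed_trip"])
(orig_counts, analysis_counts) = ts.find_entries_count(key_list=key_list)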
Mahadik, Mukul Chandrakant authored and Mahadik, Mukul Chandrakant committed Sep 5, 2023
1 parent bb88af4 commit 821478c
Showing 2 changed files with 114 additions and 17 deletions.
59 changes: 52 additions & 7 deletions emission/storage/timeseries/builtin_timeseries.py
@@ -440,18 +440,63 @@ def update_data(user_id, key, obj_id, data):
logging.debug("updating entry %s into timeseries" % new_entry)
edb.save(ts.get_timeseries_db(key), new_entry)

def find_entries_count(self, key, time_query = None, geo_query = None, extra_query_list = None):
def find_entries_count(self, key_list = None, time_query = None, geo_query = None, extra_query_list = None):
"""
Returns the total number of documents for the specific key referring to a timeseries db.
:param key: the metadata key we are querying for. Only supports one key for now.
Returns the total number of matching documents in each of the two timeseries DBs for the given key_list.
Input: key_list with keys from both timeseries DBs = [key1, key2, key3, key4, ...]
Suppose (key1, key2) are orig_tsdb keys and (key3, key4) are analysis_tsdb keys.
Output: tuple of lists = (orig_tsdb_counts, analysis_tsdb_counts)
                       = ([count_key1, count_key2, ...], [count_key3, count_key4, ...])
orig_tsdb_counts and analysis_tsdb_counts are lists containing the count of matching documents
for each key, computed separately against that key's timeseries DB.
:param key_list: list of metadata keys we are querying for.
:param time_query: the time range in which to search the stream
:param geo_query: the query for a geographical area
:param extra_query_list: any additional queries to filter out data
For key_list = None, the total count of all documents is returned for each timeseries DB.
"""
logging.debug("builtin_timeseries.find_entries_count() called")
created_query = self._get_query([key], time_query, geo_query, extra_query_list)
result_dataset = self.get_timeseries_db(key)
total_entries = result_dataset.count_documents(created_query)
return total_entries

orig_tsdb = self.timeseries_db
analysis_tsdb = self.analysis_timeseries_db

orig_tsdb_counts = []
analysis_tsdb_counts = []

if key_list == [] or key_list is None:
key_list = None

# Segregate orig_tsdb and analysis_tsdb keys
(orig_tsdb_keys, analysis_tsdb_keys) = self._split_key_list(key_list)

orig_tsdb_counts = self._get_entries_counts_for_timeseries(orig_tsdb, orig_tsdb_keys, time_query, geo_query, extra_query_list)
analysis_tsdb_counts = self._get_entries_counts_for_timeseries(analysis_tsdb, analysis_tsdb_keys, time_query, geo_query, extra_query_list)

return (orig_tsdb_counts, analysis_tsdb_counts)


def _get_entries_counts_for_timeseries(self, tsdb, key_list, time_query, geo_query, extra_query_list):

tsdb_queries = []
tsdb_counts = []

# For each key in key_list, build a separate query
if key_list is not None:
for key in key_list:
tsdb_query = self._get_query([key], time_query, geo_query, extra_query_list)
tsdb_queries.append(tsdb_query)
# For each generated query, fetch the count of matching documents
for query in tsdb_queries:
entries_count = tsdb.count_documents(query)
tsdb_counts.append(entries_count)
else:
# key_list is None: build a single query that matches all documents in this DB
tsdb_query = self._get_query(key_list, time_query, geo_query, extra_query_list)
entries_count = tsdb.count_documents(tsdb_query)
tsdb_counts = [entries_count]

return tsdb_counts
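The new code calls self._split_key_list(), which is not part of this diff. Purely as an assumption about its behavior (not the committed helper), it could be sketched as routing each key to whichever collection get_timeseries_db() resolves it to:

def _split_key_list(self, key_list):
    # Hypothetical sketch: separate keys by the collection they map to.
    if key_list is None:
        return (None, None)
    orig_tsdb_keys = []
    analysis_tsdb_keys = []
    for key in key_list:
        # An unknown key would raise KeyError here, matching the invalid-key test below.
        if self.get_timeseries_db(key) == self.analysis_timeseries_db:
            analysis_tsdb_keys.append(key)
        else:
            orig_tsdb_keys.append(key)
    return (orig_tsdb_keys, analysis_tsdb_keys)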


72 changes: 62 additions & 10 deletions emission/tests/storageTests/TestTimeSeries.py
@@ -84,24 +84,76 @@ def testExtraQueries(self):
def testFindEntriesCount(self):
'''
Test: specific keys, with the other parameters left at their default values.
Input: For each dataset: ["background/location", "background/filtered_location]
Input: For each dataset: ["background/location", "background/filtered_location", "analysis/confirmed_trip"]
- Testing this with sample dataset: "shankari_2015-aug-21", "shankari_2015-aug-27"
Output: Aug_21: [738, 508], Aug_27: [555, 327]
Output: Aug_21: ([738, 508], [0]), Aug_27: ([555, 327], [0])
- Each returned count is a single integer giving the number of matching entries for that key.
- Validated using grep count of occurrences for keys: 1) "background/location" 2) "background/filtered_location"
- $ grep -c <key> <dataset>.json
For the aggregate timeseries test case:
- UUID('e66d0a3a-4316-4d9d-ac66-ee3754081d09') is returned as the only distinct user; it is stored in MongoDB as a BinData datatype.
- Validated the count of documents for the keys via MongoDB shell access in the terminal.
- Ran these queries in the mongo shell to get the counts (a pymongo sketch appears after the test code below):
$ db.Stage_timeseries.find({$and: [{"user_id" : BinData(3,"5m0KOkMWTZ2sZu43VAgdCQ==")}, {"metadata.key" : "background/location"}]}).count()
$ db.Stage_timeseries.find({$and: [{"user_id" : BinData(3,"5m0KOkMWTZ2sZu43VAgdCQ==")}, {"metadata.key" : "background/filtered_location"}]}).count()
$ db.Stage_analysis_timeseries.find({$and: [{"user_id" : BinData(3,"5m0KOkMWTZ2sZu43VAgdCQ==")}, {"metadata.key" : "analysis/confirmed_trip"}]}).count()
- The counts returned were 1476, 1016, 5, respectively.
'''

ts1_aug_21 = esta.TimeSeries.get_time_series(self.testUUID1)
ts2_aug_27 = esta.TimeSeries.get_time_series(self.testUUID)

count_ts1 = [ts1_aug_21.find_entries_count(key="background/location"), ts1_aug_21.find_entries_count(key="background/filtered_location")]
print("\nEntry counts for location, filtered_location on {} = {}".format("Aug_21", count_ts1))
self.assertEqual(count_ts1, [738, 508])

count_ts2 = [ts2_aug_27.find_entries_count(key="background/location"), ts2_aug_27.find_entries_count(key="background/filtered_location")]
print("Entry counts for location, filtered_location on {} = {}".format("Aug_27", count_ts2))
self.assertEqual(count_ts2, [555, 327])

# Test case: Combination of original and analysis timeseries DB keys for Aug-21 dataset
key_list1=["background/location", "background/filtered_location", "analysis/confirmed_trip"]
count_ts1 = ts1_aug_21.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts1, ([738, 508], [0]))

# Test case: Combination of original and analysis timeseries DB keys for Aug-27 dataset
key_list1=["background/location", "background/filtered_location", "analysis/confirmed_trip"]
count_ts2 = ts2_aug_27.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts2, ([555, 327], [0]))

# Test case: Only original timeseries DB keys for Aug-27 dataset
key_list2=["background/location", "background/filtered_location"]
count_ts3 = ts2_aug_27.find_entries_count(key_list=key_list2)
self.assertEqual(count_ts3, ([555, 327], []))

# Test case: Only analysis timeseries DB keys
key_list3=["analysis/confirmed_trip"]
count_ts4 = ts2_aug_27.find_entries_count(key_list=key_list3)
self.assertEqual(count_ts4, ([], [0]))

# Test case: Empty key_list which should return total count of all documents in the two DBs
key_list4=[]
count_ts5 = ts1_aug_21.find_entries_count(key_list=key_list4)
self.assertEqual(count_ts5, ([2125], [0]))

# Test case: Invalid or unmatched key in metadata field
key_list5=["randomxyz_123test"]
with self.assertRaises(KeyError) as ke:
count_ts6 = ts1_aug_21.find_entries_count(key_list=key_list5)
self.assertEqual(str(ke.exception), "'randomxyz_123test'")

# Test case: Aggregate timeseries DB User data passed as input
ts_agg = esta.TimeSeries.get_aggregate_time_series()
users_distinct = ts_agg.get_distinct_users()
for uuid in users_distinct:
ts_user_ag = esta.TimeSeries.get_time_series(uuid)
count_ts7 = ts_user_ag.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts7, ([1476, 1016], [5]))

# Test case: New User created with no data to check
self.testEmail = None
self.testUUID2 = self.testUUID
etc.createAndFillUUID(self)
ts_new_user = esta.TimeSeries.get_time_series(self.testUUID)
count_ts8 = ts_new_user.find_entries_count(key_list=key_list1)
self.assertEqual(count_ts8, ([0, 0], [0]))
self.testUUID = self.testUUID2
self.testEmail = "user2"

print("Assert Test for Count Data successful!")
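For reference, the mongo shell validation queries quoted in the docstring above could be reproduced with pymongo roughly as follows. This is only a sketch: the database name, connection settings, and UUID representation are assumptions, while the collection names, keys, and expected counts come from the docstring.

from uuid import UUID
import pymongo

# Assumed local connection; legacy BinData(3, ...) user_ids generally need the
# python-legacy UUID representation to match.
client = pymongo.MongoClient(uuidRepresentation="pythonLegacy")
db = client["Stage_database"]  # assumed database name

test_uuid = UUID("e66d0a3a-4316-4d9d-ac66-ee3754081d09")
loc_count = db["Stage_timeseries"].count_documents(
    {"user_id": test_uuid, "metadata.key": "background/location"})
filtered_count = db["Stage_timeseries"].count_documents(
    {"user_id": test_uuid, "metadata.key": "background/filtered_location"})
trip_count = db["Stage_analysis_timeseries"].count_documents(
    {"user_id": test_uuid, "metadata.key": "analysis/confirmed_trip"})
print(loc_count, filtered_count, trip_count)  # expected per the docstring: 1476 1016 5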

