From 0406d88b67cf942860ecde70f4b89afc2015ab74 Mon Sep 17 00:00:00 2001 From: Zhou Yuan Date: Mon, 17 Jul 2023 18:08:56 -0400 Subject: [PATCH 1/5] Bump version to 2.1.31 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index ac127842..98dd696e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1.30 +2.1.31 From 9a41dcbd86fc0e66b1e65f6c833b66c29aecb3d9 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 17 Jul 2023 19:51:14 -0400 Subject: [PATCH 2/5] Debug cache details --- src/schema/schema_manager.py | 44 ++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index e1355e3b..23df80d3 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -513,11 +513,10 @@ def get_complete_entity_result(token, entity_dict, properties_to_skip = []): cache_result = None # Need both client and prefix when fetching the cache - if _memcached_client and _memcached_prefix: - # Do NOT fetch cache is properties_to_skip is specified - if not properties_to_skip: - cache_key = f'{_memcached_prefix}_complete_{entity_uuid}' - cache_result = _memcached_client.get(cache_key) + # Do NOT fetch cache is properties_to_skip is specified + if _memcached_client and _memcached_prefix and (not properties_to_skip): + cache_key = f'{_memcached_prefix}_complete_{entity_uuid}' + cache_result = _memcached_client.get(cache_key) # Use the cached data if found and still valid # Otherwise, calculate and add to cache @@ -537,15 +536,18 @@ def get_complete_entity_result(token, entity_dict, properties_to_skip = []): complete_entity = remove_none_values(complete_entity_dict) # Need both client and prefix when creating the cache - if _memcached_client and _memcached_prefix: + # Do NOT cache when properties_to_skip is specified + if _memcached_client and _memcached_prefix and (not properties_to_skip): logger.info(f'Creating complete entity cache of {entity_type} {entity_uuid} at time {datetime.now()}') - # Do NOT cache when properties_to_skip is specified - if not properties_to_skip: - cache_key = f'{_memcached_prefix}_complete_{entity_uuid}' - _memcached_client.set(cache_key, complete_entity, expire = SchemaConstants.MEMCACHED_TTL) + cache_key = f'{_memcached_prefix}_complete_{entity_uuid}' + _memcached_client.set(cache_key, complete_entity, expire = SchemaConstants.MEMCACHED_TTL) + + logger.debug(f"Following is the complete {entity_type} cache created at time {datetime.now()} using key {cache_key}:") + logger.debug(complete_entity) else: logger.info(f'Using complete entity cache of {entity_type} {entity_uuid} at time {datetime.now()}') + logger.debug(cache_result) complete_entity = cache_result else: @@ -648,11 +650,10 @@ def normalize_entity_result_for_response(entity_dict, properties_to_exclude = [] cache_result = None # Need both client and prefix when fetching the cache - if _memcached_client and _memcached_prefix: - # Do NOT fetch cache is properties_to_exclude is specified - if not properties_to_exclude: - cache_key = f'{_memcached_prefix}_normalized_{entity_uuid}' - cache_result = _memcached_client.get(cache_key) + # Do NOT fetch cache is properties_to_exclude is specified + if _memcached_client and _memcached_prefix and (not properties_to_exclude): + cache_key = f'{_memcached_prefix}_normalized_{entity_uuid}' + cache_result = _memcached_client.get(cache_key) # Use the cached data if found and still valid # Otherwise, calculate and add to cache @@ -689,15 +690,18 @@ def normalize_entity_result_for_response(entity_dict, properties_to_exclude = [] normalized_entity.pop(key) # Need both client and prefix when creating the cache - if _memcached_client and _memcached_prefix: + # Do NOT cache when properties_to_exclude is specified + if _memcached_client and _memcached_prefix and (not properties_to_exclude): logger.info(f'Creating normalized entity cache of {entity_type} {entity_uuid} at time {datetime.now()}') - # Do NOT cache when properties_to_exclude is specified - if not properties_to_exclude: - cache_key = f'{_memcached_prefix}_normalized_{entity_uuid}' - _memcached_client.set(cache_key, normalized_entity, expire = SchemaConstants.MEMCACHED_TTL) + cache_key = f'{_memcached_prefix}_normalized_{entity_uuid}' + _memcached_client.set(cache_key, normalized_entity, expire = SchemaConstants.MEMCACHED_TTL) + + logger.debug(f"Following is the normalized {entity_type} cache created at time {datetime.now()} using key {cache_key}:") + logger.debug(normalized_entity) else: logger.info(f'Using normalized entity cache of {entity_type} {entity_uuid} at time {datetime.now()}') + logger.debug(cache_result) normalized_entity = cache_result From aab39ae5928333aa2b15c0249d6175d92508b179 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 17 Jul 2023 22:22:12 -0400 Subject: [PATCH 3/5] Fix nested fields with metadata only --- src/schema/schema_manager.py | 8 ++-- src/schema/schema_triggers.py | 85 +++++++++-------------------------- 2 files changed, 24 insertions(+), 69 deletions(-) diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 23df80d3..21fcffc5 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -483,7 +483,7 @@ def remove_transient_and_none_values(merged_dict, normalized_entity_type): """ -Generate the complete entity record as well as result filtering for response +Generate the complete entity record by running the read triggers Parameters ---------- @@ -513,7 +513,7 @@ def get_complete_entity_result(token, entity_dict, properties_to_skip = []): cache_result = None # Need both client and prefix when fetching the cache - # Do NOT fetch cache is properties_to_skip is specified + # Do NOT fetch cache if properties_to_skip is specified if _memcached_client and _memcached_prefix and (not properties_to_skip): cache_key = f'{_memcached_prefix}_complete_{entity_uuid}' cache_result = _memcached_client.get(cache_key) @@ -524,7 +524,7 @@ def get_complete_entity_result(token, entity_dict, properties_to_skip = []): if _memcached_client and _memcached_prefix: logger.info(f'Cache of complete entity of {entity_type} {entity_uuid} not found or expired at time {datetime.now()}') - # No error handling here since if a 'on_read_trigger' method failed, + # No error handling here since if a 'on_read_trigger' method fails, # the property value will be the error message # Pass {} since no new_data_dict for 'on_read_trigger' generated_on_read_trigger_data_dict = generate_triggered_data('on_read_trigger', entity_type, token, entity_dict, {}, properties_to_skip) @@ -626,7 +626,7 @@ def normalize_activity_result_for_response(activity_dict, properties_to_exclude Parameters ---------- entity_dict : dict - A merged dictionary that contains all possible data to be used by the trigger methods + Either a neo4j node converted dict or complete dict generated from get_complete_entity_result() properties_to_exclude : list Any additional properties to exclude from the response diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 466a77ba..297bae3f 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -701,16 +701,11 @@ def get_dataset_collections(property_key, normalized_type, user_token, existing_ logger.info(f"Executing 'get_dataset_collections()' trigger method on uuid: {existing_data_dict['uuid']}") - # No property key needs to filter the result - # Get back the list of collection dicts collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) - # Exclude datasets from each resulting collection - # We don't want to show too much nested information - properties_to_skip = ['datasets'] - complete_entities_list = schema_manager.get_complete_entities_list(user_token, collections_list, properties_to_skip) - - return property_key, schema_manager.normalize_entities_list_for_response(complete_entities_list) + # Get rid of the entity node properties that are not defined in the yaml schema + # as well as the ones defined as `exposed: false` in the yaml schema + return property_key, schema_manager.normalize_entities_list_for_response(collections_list) """ Trigger event method of getting the associated collection for this publication @@ -739,16 +734,11 @@ def get_publication_associated_collection(property_key, normalized_type, user_to logger.info(f"Executing 'get_publication_associated_collection()' trigger method on uuid: {existing_data_dict['uuid']}") - # No property key needs to filter the result - # Get back the associated collection dict - collection = schema_neo4j_queries.get_publication_associated_collection(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) + collection_dict = schema_neo4j_queries.get_publication_associated_collection(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) - # Exclude datasets from the resulting collection - # We don't want to show too much nested information - properties_to_skip = ['datasets'] - complete_entity = schema_manager.get_complete_entity_result(user_token, collection, properties_to_skip) - - return property_key, schema_manager.normalize_entity_result_for_response(complete_entity) + # Get rid of the entity node properties that are not defined in the yaml schema + # as well as the ones defined as `exposed: false` in the yaml schema + return property_key, schema_manager.normalize_entity_result_for_response(collection_dict) """ Trigger event method of getting the associated Upload for this Dataset @@ -779,17 +769,11 @@ def get_dataset_upload(property_key, normalized_type, user_token, existing_data_ logger.info(f"Executing 'get_dataset_upload()' trigger method on uuid: {existing_data_dict['uuid']}") - # It could be None if the dataset doesn't in any Upload upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) - if upload_dict: - # Exclude datasets from each resulting Upload - # We don't want to show too much nested information - properties_to_skip = ['datasets'] - complete_upload_dict = schema_manager.get_complete_entity_result(user_token, upload_dict, properties_to_skip) - return_dict = schema_manager.normalize_entity_result_for_response(complete_upload_dict) - - return property_key, return_dict + # Get rid of the entity node properties that are not defined in the yaml schema + # as well as the ones defined as `exposed: false` in the yaml schema + return property_key, schema_manager.normalize_entity_result_for_response(upload_dict) """ @@ -873,14 +857,14 @@ def link_collection_to_datasets(property_key, normalized_type, user_token, exist raise """ -Trigger event method of getting source uuid +Trigger event method of getting a list of direct ancestors for a given dataset or publication Parameters ---------- property_key : str The target property key of the value to be generated normalized_type : str - One of the types defined in the schema yaml: Dataset + One of the types defined in the schema yaml: Dataset/Publication user_token: str The user's globus nexus token existing_data_dict : dict @@ -899,18 +883,11 @@ def get_dataset_direct_ancestors(property_key, normalized_type, user_token, exis logger.info(f"Executing 'get_dataset_direct_ancestors()' trigger method on uuid: {existing_data_dict['uuid']}") - # No property key needs to filter the result - # Get back the list of ancestor dicts direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) - # We don't want to show too much nested information - # The direct ancestor of a Dataset could be: Dataset or Sample - # Skip running the trigger methods for 'direct_ancestors' and 'collections' if the direct ancestor is Dataset - # Skip running the trigger methods for 'direct_ancestor' if the direct ancestor is Sample - properties_to_skip = ['direct_ancestors', 'collections', 'direct_ancestor'] - complete_entities_list = schema_manager.get_complete_entities_list(user_token, direct_ancestors_list, properties_to_skip) - - return property_key, schema_manager.normalize_entities_list_for_response(complete_entities_list) + # Get rid of the entity node properties that are not defined in the yaml schema + # as well as the ones defined as `exposed: false` in the yaml schema + return property_key, schema_manager.normalize_entities_list_for_response(direct_ancestors_list) """ @@ -1143,8 +1120,6 @@ def get_previous_revision_uuid(property_key, normalized_type, user_token, existi previous_revision_uuid = schema_neo4j_queries.get_previous_revision_uuid(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) - # previous_revision_uuid can be None, but will be filtered out by - # schema_manager.normalize_entity_result_for_response() return property_key, previous_revision_uuid @@ -1177,8 +1152,6 @@ def get_next_revision_uuid(property_key, normalized_type, user_token, existing_d next_revision_uuid = schema_neo4j_queries.get_next_revision_uuid(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) - # next_revision_uuid can be None, but will be filtered out by - # schema_manager.normalize_entity_result_for_response() return property_key, next_revision_uuid @@ -1572,7 +1545,7 @@ def link_publication_to_associated_collection(property_key, normalized_type, use raise """ -Trigger event method of getting the parent of a Sample +Trigger event method of getting the parent of a Sample, which is a Donor Parameters ---------- @@ -1600,16 +1573,9 @@ def get_sample_direct_ancestor(property_key, normalized_type, user_token, existi direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) - if 'entity_type' not in direct_ancestor_dict: - raise KeyError("The 'entity_type' property in the resulting 'direct_ancestor_dict' is not set during calling 'get_sample_direct_ancestor()' trigger method.") - - # Generate trigger data for sample's direct_ancestor and skip the direct_ancestor's direct_ancestor - properties_to_skip = ['direct_ancestor'] - complete_dict = schema_manager.get_complete_entity_result(user_token, direct_ancestor_dict, properties_to_skip) - # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema - return property_key, schema_manager.normalize_entity_result_for_response(complete_dict) + return property_key, schema_manager.normalize_entity_result_for_response(direct_ancestor_dict) """ @@ -1917,20 +1883,9 @@ def get_upload_datasets(property_key, normalized_type, user_token, existing_data datasets_list = schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) - # Additional properties of the datasets to exclude - # We don't want to show too much nested information due to performance consideration - properties_to_skip = [ - 'direct_ancestors', - 'collections', - 'upload', - 'title', - 'previous_revision_uuid', - 'next_revision_uuid' - ] - - complete_entities_list = schema_manager.get_complete_entities_list(user_token, datasets_list, properties_to_skip) - - return property_key, schema_manager.normalize_entities_list_for_response(complete_entities_list) + # Get rid of the entity node properties that are not defined in the yaml schema + # as well as the ones defined as `exposed: false` in the yaml schema + return property_key, schema_manager.normalize_entities_list_for_response(datasets_list) #################################################################################################### From dc2c02ac26d5b5bc6ec6cc1c0732be2c3780e507 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 17 Jul 2023 22:34:53 -0400 Subject: [PATCH 4/5] Return all neo4j properties for Collection.datasets --- src/schema/schema_triggers.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 297bae3f..c570d2d4 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -589,23 +589,9 @@ def get_collection_datasets(property_key, normalized_type, user_token, existing_ datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) - # These are the node properties stored in Neo4j - # None of them needs to be generated via a read trigger - properties_to_return = [ - 'uuid', - 'hubmap_id', - 'data_types', - 'status', - 'last_modified_timestamp', - 'created_by_user_displayname' - ] - - for dataset_dict in datasets_list: - for key in list(dataset_dict): - if key not in properties_to_return: - dataset_dict.pop(key) - - return property_key, datasets_list + # Get rid of the entity node properties that are not defined in the yaml schema + # as well as the ones defined as `exposed: false` in the yaml schema + return property_key, schema_manager.normalize_entities_list_for_response(datasets_list) #################################################################################################### From c10be522bae32f82bacc9acd3c3abdd5127a53cb Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Mon, 17 Jul 2023 22:45:22 -0400 Subject: [PATCH 5/5] No cache on normalized response --- src/schema/schema_manager.py | 97 +++++++++++------------------------- 1 file changed, 30 insertions(+), 67 deletions(-) diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 21fcffc5..2386b013 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -592,15 +592,13 @@ def get_complete_entities_list(token, entities_list, properties_to_skip = []): ---------- activity_dict : dict A dictionary that contains all activity details -properties_to_exclude : list - Any additional properties to exclude from the response Returns ------- dict A dictionary of activity information with keys that are all normalized """ -def normalize_activity_result_for_response(activity_dict, properties_to_exclude = []): +def normalize_activity_result_for_response(activity_dict): global _schema properties = _schema['ACTIVITIES']['Activity']['properties'] @@ -609,7 +607,7 @@ def normalize_activity_result_for_response(activity_dict, properties_to_exclude for key in activity_dict: # Only return the properties defined in the schema yaml # Exclude additional properties if specified - if (key in properties) and (key not in properties_to_exclude): + if key in properties: # By default, all properties are exposed # It's possible to see `exposed: true` if ('exposed' not in properties[key]) or (('exposed' in properties[key]) and properties[key]['exposed']): @@ -637,75 +635,41 @@ def normalize_activity_result_for_response(activity_dict, properties_to_exclude """ def normalize_entity_result_for_response(entity_dict, properties_to_exclude = []): global _schema - global _memcached_client - global _memcached_prefix normalized_entity = {} # In case entity_dict is None or # an incorrectly created entity that doesn't have the `entity_type` property - if entity_dict and ('entity_type' in entity_dict) and ('uuid' in entity_dict): - entity_uuid = entity_dict['uuid'] - entity_type = entity_dict['entity_type'] - cache_result = None - - # Need both client and prefix when fetching the cache - # Do NOT fetch cache is properties_to_exclude is specified - if _memcached_client and _memcached_prefix and (not properties_to_exclude): - cache_key = f'{_memcached_prefix}_normalized_{entity_uuid}' - cache_result = _memcached_client.get(cache_key) - - # Use the cached data if found and still valid - # Otherwise, calculate and add to cache - if cache_result is None: - if _memcached_client and _memcached_prefix: - logger.info(f'Cache of normalized entity of {entity_type} {entity_uuid} not found or expired at time {datetime.now()}') - - properties = _schema['ENTITIES'][entity_type]['properties'] - - for key in entity_dict: - # Only return the properties defined in the schema yaml - # Exclude additional properties if specified - if (key in properties) and (key not in properties_to_exclude): - # Skip properties with None value and the ones that are marked as not to be exposed. - # By default, all properties are exposed if not marked as `exposed: false` - # It's still possible to see `exposed: true` marked explictly - if (entity_dict[key] is not None) and ('exposed' not in properties[key]) or (('exposed' in properties[key]) and properties[key]['exposed']): - # Only run convert_str_literal() on string representation of Python dict and list with removing control characters - # No convertion for string representation of Python string, meaning that can still contain control characters - if entity_dict[key] and (properties[key]['type'] in ['list', 'json_string']): - logger.info(f"Executing convert_str_literal() on {entity_type}.{key} of uuid: {entity_uuid}") - - # Safely evaluate a string containing a Python dict or list literal - # Only convert to Python list/dict when the string literal is not empty - # instead of returning the json-as-string or array-as-string - # convert_str_literal() also removes those control chars to avoid SyntaxError - entity_dict[key] = convert_str_literal(entity_dict[key]) - - # Add the target key with correct value of data type to the normalized_entity dict - normalized_entity[key] = entity_dict[key] - - # Final step: remove properties with empty string value, empty dict {}, and empty list [] - if (isinstance(normalized_entity[key], (str, dict, list)) and (not normalized_entity[key])): - normalized_entity.pop(key) - - # Need both client and prefix when creating the cache - # Do NOT cache when properties_to_exclude is specified - if _memcached_client and _memcached_prefix and (not properties_to_exclude): - logger.info(f'Creating normalized entity cache of {entity_type} {entity_uuid} at time {datetime.now()}') - - cache_key = f'{_memcached_prefix}_normalized_{entity_uuid}' - _memcached_client.set(cache_key, normalized_entity, expire = SchemaConstants.MEMCACHED_TTL) - - logger.debug(f"Following is the normalized {entity_type} cache created at time {datetime.now()} using key {cache_key}:") - logger.debug(normalized_entity) - else: - logger.info(f'Using normalized entity cache of {entity_type} {entity_uuid} at time {datetime.now()}') - logger.debug(cache_result) + if entity_dict and ('entity_type' in entity_dict): + normalized_entity_type = entity_dict['entity_type'] + properties = _schema['ENTITIES'][normalized_entity_type]['properties'] + + for key in entity_dict: + # Only return the properties defined in the schema yaml + # Exclude additional properties if specified + if (key in properties) and (key not in properties_to_exclude): + # Skip properties with None value and the ones that are marked as not to be exposed. + # By default, all properties are exposed if not marked as `exposed: false` + # It's still possible to see `exposed: true` marked explictly + if (entity_dict[key] is not None) and ('exposed' not in properties[key]) or (('exposed' in properties[key]) and properties[key]['exposed']): + # Only run convert_str_literal() on string representation of Python dict and list with removing control characters + # No convertion for string representation of Python string, meaning that can still contain control characters + if entity_dict[key] and (properties[key]['type'] in ['list', 'json_string']): + logger.info(f"Executing convert_str_literal() on {normalized_entity_type}.{key} of uuid: {entity_dict['uuid']}") + + # Safely evaluate a string containing a Python dict or list literal + # Only convert to Python list/dict when the string literal is not empty + # instead of returning the json-as-string or array-as-string + # convert_str_literal() also removes those control chars to avoid SyntaxError + entity_dict[key] = convert_str_literal(entity_dict[key]) + + # Add the target key with correct value of data type to the normalized_entity dict + normalized_entity[key] = entity_dict[key] - normalized_entity = cache_result + # Final step: remove properties with empty string value, empty dict {}, and empty list [] + if (isinstance(normalized_entity[key], (str, dict, list)) and (not normalized_entity[key])): + normalized_entity.pop(key) - # One final return return normalized_entity @@ -1794,7 +1758,6 @@ def delete_memcached_cache(uuids_list): for uuid in uuids_list: cache_keys.append(f'{_memcached_prefix}_neo4j_{uuid}') cache_keys.append(f'{_memcached_prefix}_complete_{uuid}') - cache_keys.append(f'{_memcached_prefix}_normalized_{uuid}') _memcached_client.delete_many(cache_keys)