From 2f9ec567ebe1df9fccb05f139d2f669661e50018 Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Thu, 22 Jun 2023 16:08:28 -0700 Subject: [PATCH] Addition of ER EntityKeyTable and bug fixes (#872) --- CHANGELOG.md | 5 + docs/gallery/plot_external_resources.py | 16 +- src/hdmf/common/hdmf-common-schema | 2 +- src/hdmf/common/io/resources.py | 6 +- src/hdmf/common/resources.py | 326 ++++++++++----------- tests/unit/common/test_resources.py | 372 ++++++++++++++++++------ 6 files changed, 458 insertions(+), 269 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad3d40228..9810a16b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # HDMF Changelog +## HDMF 3.6.2 (Upcoming) + +### New features and minor improvements +- Updated `ExternalResources` to have EntityKeyTable with updated tests/documentation and minor bug fix to ObjectKeyTable. @mavaylon1 [#872](https://github.com/hdmf-dev/hdmf/pull/872) + ## HMDF 3.6.1 (May 18, 2023) ### Bug fixes diff --git a/docs/gallery/plot_external_resources.py b/docs/gallery/plot_external_resources.py index d8ed891fb..c8748c0fe 100644 --- a/docs/gallery/plot_external_resources.py +++ b/docs/gallery/plot_external_resources.py @@ -91,8 +91,8 @@ from hdmf.common import DynamicTable, VectorData from hdmf import Container, ExternalResourcesManager from hdmf import Data -from hdmf.testing import remove_test_file import numpy as np +import os # Ignore experimental feature warnings in the tutorial to improve rendering import warnings warnings.filterwarnings("ignore", category=UserWarning, message="ExternalResources is experimental*") @@ -306,17 +306,17 @@ def __init__(self, **kwargs): ############################################################################### # Write ExternalResources # ------------------------------------------------------ -# :py:class:`~hdmf.common.resources.ExternalResources` is written as a flattened tsv file.
-# The user provides the path, which contains the name of the file, to where the tsv -# file will be written. +# :py:class:`~hdmf.common.resources.ExternalResources` is written as a zip file of +# the individual tables written to tsv. +# The user provides the path, which contains the name of the directory. -er.to_flat_tsv(path='./er_example.tsv') +er.to_norm_tsv(path='./') ############################################################################### # Read ExternalResources # ------------------------------------------------------ # Users can read :py:class:`~hdmf.common.resources.ExternalResources` from the tsv format -# by providing the path to the file. +# by providing the path to the directory. -er_read = ExternalResources.from_flat_tsv(path='./er_example.tsv') -remove_test_file('./er_example.tsv') +er_read = ExternalResources.from_norm_tsv(path='./') +os.remove('./er.zip') diff --git a/src/hdmf/common/hdmf-common-schema b/src/hdmf/common/hdmf-common-schema index b82320919..144552a4e 160000 --- a/src/hdmf/common/hdmf-common-schema +++ b/src/hdmf/common/hdmf-common-schema @@ -1 +1 @@ -Subproject commit b82320919c64e9d1540d7de3f8c88ef5d12d9de9 +Subproject commit 144552a4e9ad43ea0aa040d94467ffa6ee980a98 diff --git a/src/hdmf/common/io/resources.py b/src/hdmf/common/io/resources.py index 6ecf7088a..5d4823b47 100644 --- a/src/hdmf/common/io/resources.py +++ b/src/hdmf/common/io/resources.py @@ -1,5 +1,5 @@ from .. 
import register_map -from ..resources import ExternalResources, KeyTable, FileTable, ObjectTable, ObjectKeyTable, EntityTable +from ..resources import ExternalResources, KeyTable, FileTable, ObjectTable, ObjectKeyTable, EntityTable, EntityKeyTable from ...build import ObjectMapper @@ -38,3 +38,7 @@ def objects(self, builder, manager): @ObjectMapper.constructor_arg('object_keys') def object_keys(self, builder, manager): return self.construct_helper('object_keys', builder, ObjectKeyTable, manager) + + @ObjectMapper.constructor_arg('entity_keys') + def entity_keys(self, builder, manager): + return self.construct_helper('entity_keys', builder, EntityKeyTable, manager) diff --git a/src/hdmf/common/resources.py b/src/hdmf/common/resources.py index 1f1e3b1c9..410fc6f10 100644 --- a/src/hdmf/common/resources.py +++ b/src/hdmf/common/resources.py @@ -7,6 +7,7 @@ from ..build import TypeMap from glob import glob import os +import zipfile class KeyTable(Table): @@ -38,9 +39,6 @@ class EntityTable(Table): __defaultname__ = 'entities' __columns__ = ( - {'name': 'keys_idx', 'type': (int, Key), - 'doc': ('The index into the keys table for the user key that ' - 'maps to the resource term / registry symbol.')}, {'name': 'entity_id', 'type': str, 'doc': 'The unique ID for the resource term / registry symbol.'}, {'name': 'entity_uri', 'type': str, @@ -123,6 +121,29 @@ class ObjectKeyTable(Table): ) +class EntityKeyTable(Table): + """ + A table for identifying which entities are used by which keys for referring to external resources. + """ + + __defaultname__ = 'entity_keys' + + __columns__ = ( + {'name': 'entities_idx', 'type': (int, Entity), + 'doc': 'The index into the EntityTable for the Entity that associated with the Key.'}, + {'name': 'keys_idx', 'type': (int, Key), + 'doc': 'The index into the KeyTable that is used to make an external resource reference.'} + ) + + +class EntityKey(Row): + """ + A Row class for representing rows in the EntityKeyTable. 
+ """ + + __table__ = EntityKeyTable + + class ObjectKey(Row): """ A Row class for representing rows in the ObjectKeyTable. @@ -140,6 +161,7 @@ class ExternalResources(Container): {'name': 'files', 'child': True}, {'name': 'objects', 'child': True}, {'name': 'object_keys', 'child': True}, + {'name': 'entity_keys', 'child': True}, {'name': 'entities', 'child': True}, ) @@ -152,7 +174,9 @@ class ExternalResources(Container): {'name': 'objects', 'type': ObjectTable, 'default': None, 'doc': 'The table storing object information.'}, {'name': 'object_keys', 'type': ObjectKeyTable, 'default': None, - 'doc': 'The table storing object-resource relationships.'}, + 'doc': 'The table storing object-key relationships.'}, + {'name': 'entity_keys', 'type': EntityKeyTable, 'default': None, + 'doc': 'The table storing entity-key relationships.'}, {'name': 'type_map', 'type': TypeMap, 'default': None, 'doc': 'The type map. If None is provided, the HDMF-common type map will be used.'}, allow_positional=AllowPositional.WARNING) @@ -164,6 +188,7 @@ def __init__(self, **kwargs): self.entities = kwargs['entities'] or EntityTable() self.objects = kwargs['objects'] or ObjectTable() self.object_keys = kwargs['object_keys'] or ObjectKeyTable() + self.entity_keys = kwargs['entity_keys'] or EntityKeyTable() self.type_map = kwargs['type_map'] or get_type_map() @staticmethod @@ -243,19 +268,15 @@ def _add_file(self, **kwargs): file_object_id = kwargs['file_object_id'] return File(file_object_id, table=self.files) - @docval({'name': 'key', 'type': (str, Key), 'doc': 'The key to associate the entity with.'}, - {'name': 'entity_id', 'type': str, 'doc': 'The unique entity id.'}, + @docval({'name': 'entity_id', 'type': str, 'doc': 'The unique entity id.'}, {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the entity.'}) def _add_entity(self, **kwargs): """ - Add an entity that will be referenced to using the given key. 
+ Add an entity that will be referenced to using keys specified in ExternalResources.entity_keys. """ - key = kwargs['key'] entity_id = kwargs['entity_id'] entity_uri = kwargs['entity_uri'] - if not isinstance(key, Key): - key = self._add_key(key) - entity = Entity(key, entity_id, entity_uri, table=self.entities) + entity = Entity( entity_id, entity_uri, table=self.entities) return entity @docval({'name': 'container', 'type': (str, AbstractContainer), @@ -298,6 +319,15 @@ def _add_object_key(self, **kwargs): obj, key = popargs('obj', 'key', kwargs) return ObjectKey(obj, key, table=self.object_keys) + @docval({'name': 'entity', 'type': (int, Entity), 'doc': 'The Entity associated with the Key.'}, + {'name': 'key', 'type': (int, Key), 'doc': 'The Key that the connected to the Entity.'}) + def _add_entity_key(self, **kwargs): + """ + Add entity-key relationship to the EntityKeyTable. + """ + entity, key = popargs('entity', 'key', kwargs) + return EntityKey(entity, key, table=self.entity_keys) + @docval({'name': 'file', 'type': ExternalResourcesManager, 'doc': 'The file associated with the container.'}, {'name': 'container', 'type': AbstractContainer, 'doc': ('The Container/Data object that uses the key or ' @@ -424,6 +454,15 @@ def get_key(self, **kwargs): else: return self.keys.row[key_idx_matches[0]] + @docval({'name': 'entity_id', 'type': str, 'doc': 'The ID for the identifier at the resource.'}) + def get_entity(self, **kwargs): + entity_id = kwargs['entity_id'] + entity = self.entities.which(entity_id=entity_id) + if len(entity)>0: + return self.entities.row[entity[0]] + else: + return None + @docval({'name': 'container', 'type': (str, AbstractContainer), 'default': None, 'doc': ('The Container/Data object that uses the key or ' 'the object_id for the Container/Data object that uses the key.')}, @@ -434,7 +473,7 @@ def get_key(self, **kwargs): {'name': 'key', 'type': (str, Key), 'default': None, 'doc': 'The name of the key or the Key object from the KeyTable for 
the key to add a resource for.'}, {'name': 'entity_id', 'type': str, 'doc': 'The identifier for the entity at the resource.'}, - {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the identifier at the resource.'}, + {'name': 'entity_uri', 'type': str, 'doc': 'The URI for the identifier at the resource.', 'default': None}, {'name': 'file', 'type': ExternalResourcesManager, 'doc': 'The file associated with the container.', 'default': None}, ) @@ -512,12 +551,58 @@ def add_ref(self, **kwargs): msg = "Use Key Object when referencing an existing (container, relative_path, key)" raise ValueError(msg) - if not isinstance(key, Key): key = self._add_key(key) self._add_object_key(object_field, key) - entity = self._add_entity(key, entity_id, entity_uri) + else: + # Check to see that the existing key is being used with the object. + # If true, do nothing. If false, create a new obj/key relationship + # in the ObjectKeyTable + key_idx = key.idx + object_key_row_idx = self.object_keys.which(keys_idx=key_idx) + if len(object_key_row_idx)!=0: + obj_key_check = False + for row_idx in object_key_row_idx: + obj_idx = self.object_keys['objects_idx', row_idx] + if obj_idx == object_field.idx: + obj_key_check = True + if not obj_key_check: + self._add_object_key(object_field, key) + else: + msg = "Cannot find key object. Create new Key with string." + raise ValueError(msg) + # check if the key and object have been related in the ObjectKeyTable + entity = self.get_entity(entity_id=entity_id) + if entity is None: + if entity_uri is None: + msg = 'New entities must have an entity_uri.' + raise ValueError(msg) + entity = self._add_entity(entity_id, entity_uri) + self._add_entity_key(entity, key) + else: + if entity_uri is not None: + msg = 'If you plan on reusing an entity, then entity_uri parameter must be None.' 
+ raise ValueError(msg) + # check for entity-key relationship in EntityKeyTable + key_idx = key.idx + entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) + if len(entity_key_row_idx)!=0: + # this means there exist rows where the key is in the EntityKeyTable + entity_key_check = False + for row_idx in entity_key_row_idx: + entity_idx = self.entity_keys['entities_idx', row_idx] + if entity_idx == entity.idx: + entity_key_check = True + # this means there is already a key-entity relationship recorded + if not entity_key_check: + # this means that though the key is there, there is no key-entity relationship + # so add it now + self._add_entity_key(entity, key) + else: + # this means that the specific key is not in the EntityKeyTable, so add it and establish + # the relationship with the entity + self._add_entity_key(entity, key) return key, entity @docval({'name': 'object_type', 'type': str, @@ -594,17 +679,11 @@ def get_object_entities(self, **kwargs): keys.append(self.object_keys['keys_idx', row_idx]) # Find all the entities/resources for each key.
for key_idx in keys: - entity_idx = self.entities.which(keys_idx=key_idx) - entities.append(list(self.entities.__getitem__(entity_idx[0]))) - df = pd.DataFrame(entities, columns=['keys_idx', 'entity_id', 'entity_uri']) - - key_names = [] - for idx in df['keys_idx']: - key_id_val = self.keys.to_dataframe().iloc[int(idx)]['key'] - key_names.append(key_id_val) - - df['keys_idx'] = key_names - df = df.rename(columns={'keys_idx': 'key_names', 'entity_id': 'entity_id', 'entity_uri': 'entity_uri'}) + entity_key_row_idx = self.entity_keys.which(keys_idx=key_idx) + for row_idx in entity_key_row_idx: + entity_idx = self.entity_keys['entities_idx', row_idx] + entities.append(self.entities.__getitem__(entity_idx)) + df = pd.DataFrame(entities, columns=['entity_id', 'entity_uri']) return df @docval({'name': 'use_categories', 'type': bool, 'default': False, @@ -621,20 +700,13 @@ def to_dataframe(self, **kwargs): """ use_categories = popargs('use_categories', kwargs) - # Step 1: Combine the entities, keys, and files table - entities_df = self.entities.to_dataframe() - # Map the keys to the entities by 1) convert to dataframe, 2) select rows based on the keys_idx - # from the entities table, expanding the dataframe to have the same number of rows as the - # entities, and 3) reset the index to avoid duplicate values in the index, which causes errors when merging - keys_mapped_df = self.keys.to_dataframe().iloc[entities_df['keys_idx']].reset_index(drop=True) - # Map the resources to entities using the same strategy as for the keys - # resources_mapped_df = self.resources.to_dataframe().iloc[entities_df['resources_idx']].reset_index(drop=True) - # Merge the mapped keys and resources with the entities tables - entities_df = pd.concat(objs=[entities_df, keys_mapped_df], - axis=1, verify_integrity=False) - # Add a column for the entity id (for consistency with the other tables and to facilitate query) - entities_df['entities_idx'] = entities_df.index - + # Step 1: Combine the entities, 
keys, and entity_keys table + ent_key_df = self.entity_keys.to_dataframe() + entities_mapped_df = self.entities.to_dataframe().iloc[ent_key_df['entities_idx']].reset_index(drop=True) + keys_mapped_df = self.keys.to_dataframe().iloc[ent_key_df['keys_idx']].reset_index(drop=True) + ent_key_df = pd.concat(objs=[ent_key_df, entities_mapped_df, keys_mapped_df], + axis=1, + verify_integrity=False) # Step 2: Combine the the files, object_keys and objects tables object_keys_df = self.object_keys.to_dataframe() objects_mapped_df = self.objects.to_dataframe().iloc[object_keys_df['objects_idx']].reset_index(drop=True) @@ -650,7 +722,7 @@ def to_dataframe(self, **kwargs): # Create for each row in the objects_keys table a DataFrame with all corresponding data from all tables objs=[pd.merge( # Find all entities that correspond to the row i of the object_keys_table - entities_df[entities_df['keys_idx'] == object_keys_df['keys_idx'].iloc[i]].reset_index(drop=True), + ent_key_df[ent_key_df['keys_idx'] == object_keys_df['keys_idx'].iloc[i]].reset_index(drop=True), # Get a DataFrame for row i of the objects_keys_table file_object_object_key_df.iloc[[i, ]], # Merge the entities and object_keys on the keys_idx column so that the values from the single @@ -660,7 +732,6 @@ def to_dataframe(self, **kwargs): # Concatenate the rows of the objs axis=0, verify_integrity=False) - # Step 4: Clean up the index and sort columns by table type and name result_df.reset_index(inplace=True, drop=True) # ADD files @@ -693,16 +764,28 @@ def to_norm_tsv(self, **kwargs): """ Write the tables in ExternalResources to individual tsv files. 
""" - folder_path = kwargs['path'] - for child in self.children: - df = child.to_dataframe() - df.to_csv(folder_path+'/'+child.name+'.tsv', sep='\t', index=False) + path = kwargs['path'] + files = [path+child.name+'.tsv' for child in self.children] + + for i in range(len(self.children)): + df = self.children[i].to_dataframe() + df.to_csv(files[i], sep='\t', index=False) + + with zipfile.ZipFile('er.zip', 'w') as zipF: + for file in files: + zipF.write(file) + + # remove tsv files + for file in files: + os.remove(file) @classmethod @docval({'name': 'path', 'type': str, 'doc': 'path of the folder containing the tsv files to read'}, returns="ExternalResources loaded from TSV", rtype="ExternalResources") def from_norm_tsv(cls, **kwargs): path = kwargs['path'] + with zipfile.ZipFile(path+'/er.zip', 'r') as zip: + zip.extractall(path) tsv_paths = glob(path+'/*') for file in tsv_paths: @@ -710,173 +793,70 @@ def from_norm_tsv(cls, **kwargs): if file_name == 'files.tsv': files_df = pd.read_csv(file, sep='\t').replace(np.nan, '') files = FileTable().from_dataframe(df=files_df, name='files', extra_ok=False) + os.remove(file) continue if file_name == 'keys.tsv': keys_df = pd.read_csv(file, sep='\t').replace(np.nan, '') keys = KeyTable().from_dataframe(df=keys_df, name='keys', extra_ok=False) + os.remove(file) continue if file_name == 'entities.tsv': entities_df = pd.read_csv(file, sep='\t').replace(np.nan, '') entities = EntityTable().from_dataframe(df=entities_df, name='entities', extra_ok=False) + os.remove(file) continue if file_name == 'objects.tsv': objects_df = pd.read_csv(file, sep='\t').replace(np.nan, '') objects = ObjectTable().from_dataframe(df=objects_df, name='objects', extra_ok=False) + os.remove(file) continue if file_name == 'object_keys.tsv': object_keys_df = pd.read_csv(file, sep='\t').replace(np.nan, '') object_keys = ObjectKeyTable().from_dataframe(df=object_keys_df, name='object_keys', extra_ok=False) + os.remove(file) + continue + if file_name == 
'entity_keys.tsv': + ent_key_df = pd.read_csv(file, sep='\t').replace(np.nan, '') + entity_keys = EntityKeyTable().from_dataframe(df=ent_key_df, name='entity_keys', extra_ok=False) + os.remove(file) continue # we need to check the idx columns in entities, objects, and object_keys - keys_idx = entities['keys_idx'] - for idx in keys_idx: - if not int(idx) < keys.__len__(): - msg = "Key Index out of range in EntityTable. Please check for alterations." + entity_idx = entity_keys['entities_idx'] + for idx in entity_idx: + if not int(idx) < len(entities): + msg = "Entity Index out of range in EntityTable. Please check for alterations." raise ValueError(msg) files_idx = objects['files_idx'] for idx in files_idx: - if not int(idx) < files.__len__(): + if not int(idx) < len(files): msg = "File_ID Index out of range in ObjectTable. Please check for alterations." raise ValueError(msg) object_idx = object_keys['objects_idx'] for idx in object_idx: - if not int(idx) < objects.__len__(): + if not int(idx) < len(objects): msg = "Object Index out of range in ObjectKeyTable. Please check for alterations." raise ValueError(msg) keys_idx = object_keys['keys_idx'] for idx in keys_idx: - if not int(idx) < keys.__len__(): + if not int(idx) < len(keys): msg = "Key Index out of range in ObjectKeyTable. Please check for alterations." raise ValueError(msg) + keys_idx = entity_keys['keys_idx'] + for idx in keys_idx: + if not int(idx) < len(keys): + msg = "Key Index out of range in EntityKeyTable. Please check for alterations." + raise ValueError(msg) + + er = ExternalResources(files=files, keys=keys, entities=entities, + entity_keys=entity_keys, objects=objects, object_keys=object_keys) return er - - @docval({'name': 'path', 'type': str, 'doc': 'path of the tsv file to write'}) - def to_flat_tsv(self, **kwargs): - """ - Write ExternalResources as a single, flat table to TSV - Internally, the function uses :py:meth:`pandas.DataFrame.to_csv`. 
Pandas can - infer compression based on the filename, i.e., by changing the file extension to - '.gz', '.bz2', '.zip', '.xz', or '.zst' we can write compressed files. - The TSV is formatted as follows: 1) line one indicates for each column the name of the table - the column belongs to, 2) line two is the name of the column within the table, 3) subsequent - lines are each a row in the flattened ExternalResources table. The first column is the - row id in the flattened table and does not have a label, i.e., the first and second - row will start with a tab character, and subsequent rows are numbered sequentially 1,2,3,... . - - See also :py:meth:`~hdmf.common.resources.ExternalResources.from_tsv` - """ # noqa: E501 - path = popargs('path', kwargs) - df = self.to_dataframe(use_categories=True) - df.to_csv(path, sep='\t') - - @classmethod - @docval({'name': 'path', 'type': str, 'doc': 'path of the tsv file to read'}, - returns="ExternalResources loaded from TSV", rtype="ExternalResources") - def from_flat_tsv(cls, **kwargs): - """ - Read ExternalResources from a flat tsv file - Formatting of the TSV file is assumed to be consistent with the format - generated by :py:meth:`~hdmf.common.resources.ExternalResources.to_tsv`. - The function attempts to validate that the data in the TSV is consistent - and parses the data from the denormalized table in the TSV to the - normalized linked table structure used by ExternalResources. - Currently the checks focus on ensuring that row id links between tables are valid. - Inconsistencies in other (non-index) fields (e.g., when two rows with the same resource_idx - have different resource_uri values) are not checked and will be ignored. In this case, the value - from the first row that contains the corresponding entry will be kept. - - .. note:: - Since TSV files may be edited by hand or other applications, it is possible that data - in the TSV may be inconsistent. 
E.g., object_idx may be missing if rows were removed - and ids not updated. Also since the TSV is flattened into a single denormalized table - (i.e., data are stored with duplication, rather than normalized across several tables), - it is possible that values may be inconsistent if edited outside. E.g., we may have - objects with the same index (object_idx) but different object_id, relative_path, or field - values. While flat TSVs are sometimes preferred for ease of sharing, editing - the TSV without using the :py:meth:`~hdmf.common.resources.ExternalResources` class - should be done with great care! - """ - def check_idx(idx_arr, name): - """Check that indices are consecutively numbered without missing values""" - idx_diff = np.diff(idx_arr) - if np.any(idx_diff != 1): - missing_idx = [i for i in range(np.max(idx_arr)) if i not in idx_arr] - msg = "Missing %s entries %s" % (name, str(missing_idx)) - raise ValueError(msg) - - path = popargs('path', kwargs) - df = pd.read_csv(path, header=[0, 1], sep='\t').replace(np.nan, '') - # Construct the ExternalResources - er = ExternalResources() - # Retrieve all the Files - files_idx, files_rows = np.unique(df[('objects', 'files_idx')], return_index=True) - file_order = np.argsort(files_idx) - files_idx = files_idx[file_order] - files_rows = files_rows[file_order] - # Check that files are consecutively numbered - check_idx(idx_arr=files_idx, name='files_idx') - files = df[('files', 'file_object_id')].iloc[files_rows] - for file in zip(files): - er._add_file(file_object_id=file[0]) - - # Retrieve all the objects - ob_idx, ob_rows = np.unique(df[('objects', 'objects_idx')], return_index=True) - # Sort objects based on their index - ob_order = np.argsort(ob_idx) - ob_idx = ob_idx[ob_order] - ob_rows = ob_rows[ob_order] - # Check that objects are consecutively numbered - check_idx(idx_arr=ob_idx, name='objects_idx') - # Add the objects to the Object table - ob_files = df[('objects', 'files_idx')].iloc[ob_rows] - ob_ids = 
df[('objects', 'object_id')].iloc[ob_rows] - ob_types = df[('objects', 'object_type')].iloc[ob_rows] - ob_relpaths = df[('objects', 'relative_path')].iloc[ob_rows] - ob_fields = df[('objects', 'field')].iloc[ob_rows] - for ob in zip(ob_files, ob_ids, ob_types, ob_relpaths, ob_fields): - er._add_object(files_idx=ob[0], container=ob[1], object_type=ob[2], relative_path=ob[3], field=ob[4]) - # Retrieve all keys - keys_idx, keys_rows = np.unique(df[('keys', 'keys_idx')], return_index=True) - # Sort keys based on their index - keys_order = np.argsort(keys_idx) - keys_idx = keys_idx[keys_order] - keys_rows = keys_rows[keys_order] - # Check that keys are consecutively numbered - check_idx(idx_arr=keys_idx, name='keys_idx') - # Add the keys to the Keys table - keys_key = df[('keys', 'key')].iloc[keys_rows] - all_added_keys = [er._add_key(k) for k in keys_key] - - # Add all the object keys to the ObjectKeys table. A single key may be assigned to multiple - # objects. As such it is not sufficient to iterate over the unique ob_rows with the unique - # objects, but we need to find all unique (objects_idx, keys_idx) combinations. 
- ob_keys_idx = np.unique(df[[('objects', 'objects_idx'), ('keys', 'keys_idx')]], axis=0) - for obk in ob_keys_idx: - er._add_object_key(obj=obk[0], key=obk[1]) - - # Retrieve all entities - entities_idx, entities_rows = np.unique(df[('entities', 'entities_idx')], return_index=True) - # Sort entities based on their index - entities_order = np.argsort(entities_idx) - entities_idx = entities_idx[entities_order] - entities_rows = entities_rows[entities_order] - # Check that entities are consecutively numbered - check_idx(idx_arr=entities_idx, name='entities_idx') - # Add the entities to the Resources table - entities_id = df[('entities', 'entity_id')].iloc[entities_rows] - entities_uri = df[('entities', 'entity_uri')].iloc[entities_rows] - entities_keys = np.array(all_added_keys)[df[('keys', 'keys_idx')].iloc[entities_rows]] - for e in zip(entities_keys, entities_id, entities_uri): - er._add_entity(key=e[0], entity_id=e[1], entity_uri=e[2]) - # Return the reconstructed ExternalResources - return er diff --git a/tests/unit/common/test_resources.py b/tests/unit/common/test_resources.py index a278ad1a8..2b4587241 100644 --- a/tests/unit/common/test_resources.py +++ b/tests/unit/common/test_resources.py @@ -7,6 +7,8 @@ from tests.unit.build_tests.test_io_map import Bar from tests.unit.helpers.utils import create_test_type_map, CORE_NAMESPACE from hdmf.spec import GroupSpec, AttributeSpec, DatasetSpec +from glob import glob +import zipfile class ExternalResourcesManagerContainer(Container, ExternalResourcesManager): @@ -36,11 +38,24 @@ def setUpContainer(self): def remove_er_files(self): remove_test_file('./entities.tsv') + remove_test_file('./entity_keys.tsv') remove_test_file('./objects.tsv') remove_test_file('./object_keys.tsv') remove_test_file('./keys.tsv') remove_test_file('./files.tsv') remove_test_file('./er.tsv') + remove_test_file('./er.zip') + + def child_tsv(self, external_resources): + for child in external_resources.children: + df = child.to_dataframe() + 
df.to_csv('./'+child.name+'.tsv', sep='\t', index=False) + + def zip_child(self): + files = glob('*.tsv') + with zipfile.ZipFile('er.zip', 'w') as zipF: + for file in files: + zipF.write(file) def test_to_dataframe(self): # Setup complex external resources with keys reused across objects and @@ -55,16 +70,25 @@ def test_to_dataframe(self): ) ) - file = ExternalResourcesManagerContainer(name='file') + data2 = Data( + name='data_name', + data=np.array( + [('Mus musculus', 9, 81.0), ('Homo sapiens', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')] + ) + ) - ck1, e1 = er.add_ref(file=file, + file_1 = ExternalResourcesManagerContainer(name='file_1') + file_2 = ExternalResourcesManagerContainer(name='file_2') + + k1, e1 = er.add_ref(file=file_1, container=data1, field='species', key='Mus musculus', entity_id='NCBI:txid10090', entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') - k2, e2 = er.add_ref(file=file, - container=data1, + k2, e2 = er.add_ref(file=file_2, + container=data2, field='species', key='Homo sapiens', entity_id='NCBI:txid9606', @@ -73,10 +97,10 @@ def test_to_dataframe(self): # Convert to dataframe and compare against the expected result result_df = er.to_dataframe() expected_df_data = \ - {'file_object_id': {0: file.object_id, 1: file.object_id}, - 'objects_idx': {0: 0, 1: 0}, - 'object_id': {0: data1.object_id, 1: data1.object_id}, - 'files_idx': {0: 0, 1: 0}, + {'file_object_id': {0: file_1.object_id, 1: file_2.object_id}, + 'objects_idx': {0: 0, 1: 1}, + 'object_id': {0: data1.object_id, 1: data2.object_id}, + 'files_idx': {0: 0, 1: 1}, 'object_type': {0: 'Data', 1: 'Data'}, 'relative_path': {0: '', 1: ''}, 'field': {0: 'species', 1: 'species'}, @@ -196,7 +220,7 @@ def test_add_ref_search_for_file(self): er.add_ref(container=em, key='key1', entity_id='entity_id1', entity_uri='entity1') self.assertEqual(er.keys.data, [('key1',)]) - self.assertEqual(er.entities.data, [(0, 'entity_id1', 'entity1')]) + 
self.assertEqual(er.entities.data, [('entity_id1', 'entity1')]) self.assertEqual(er.objects.data, [(0, em.object_id, 'ExternalResourcesManagerContainer', '', '')]) def test_add_ref_search_for_file_parent(self): @@ -209,7 +233,7 @@ def test_add_ref_search_for_file_parent(self): er.add_ref(container=child, key='key1', entity_id='entity_id1', entity_uri='entity1') self.assertEqual(er.keys.data, [('key1',)]) - self.assertEqual(er.entities.data, [(0, 'entity_id1', 'entity1')]) + self.assertEqual(er.entities.data, [('entity_id1', 'entity1')]) self.assertEqual(er.objects.data, [(0, child.object_id, 'Container', '', '')]) def test_add_ref_search_for_file_nested_parent(self): @@ -224,7 +248,7 @@ def test_add_ref_search_for_file_nested_parent(self): er.add_ref(container=nested_child, key='key1', entity_id='entity_id1', entity_uri='entity1') self.assertEqual(er.keys.data, [('key1',)]) - self.assertEqual(er.entities.data, [(0, 'entity_id1', 'entity1')]) + self.assertEqual(er.entities.data, [('entity_id1', 'entity1')]) self.assertEqual(er.objects.data, [(0, nested_child.object_id, 'Container', '', '')]) def test_add_ref_search_for_file_error(self): @@ -246,7 +270,7 @@ def test_add_ref(self): entity_id='entity_id1', entity_uri='entity1') self.assertEqual(er.keys.data, [('key1',)]) - self.assertEqual(er.entities.data, [(0, 'entity_id1', 'entity1')]) + self.assertEqual(er.entities.data, [('entity_id1', 'entity1')]) self.assertEqual(er.objects.data, [(0, data.object_id, 'Data', '', '')]) def test_get_object_type(self): @@ -313,7 +337,19 @@ def test_get_object_type_all_instances(self): 'entities_idx': 'uint32'}) pd.testing.assert_frame_equal(df, expected_df) - def test_get_entities(self): + def test_get_entity(self): + er = ExternalResources() + data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) + file = ExternalResourcesManagerContainer(name='file') + er.add_ref(file=file, + container=data, + key='key1', + entity_id='entity_id1', + entity_uri='entity1') + 
self.assertEqual(er.get_entity(entity_id='entity_id1').idx, 0) + self.assertEqual(er.get_entity(entity_id='entity_id2'), None) + + def test_get_obj_entities(self): er = ExternalResources() data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) file = ExternalResourcesManagerContainer(name='file') @@ -326,14 +362,13 @@ def test_get_entities(self): df = er.get_object_entities(file=file, container=data) expected_df_data = \ - {'key_names': {0: 'key1'}, - 'entity_id': {0: 'entity_id1'}, + {'entity_id': {0: 'entity_id1'}, 'entity_uri': {0: 'entity1'}} expected_df = pd.DataFrame.from_dict(expected_df_data) pd.testing.assert_frame_equal(df, expected_df) - def test_get_entities_file_none_container(self): + def test_get_obj_entities_file_none_container(self): er = ExternalResources() file = ExternalResourcesManagerContainer() er.add_ref(container=file, @@ -343,14 +378,13 @@ def test_get_entities_file_none_container(self): df = er.get_object_entities(container=file) expected_df_data = \ - {'key_names': {0: 'key1'}, - 'entity_id': {0: 'entity_id1'}, + {'entity_id': {0: 'entity_id1'}, 'entity_uri': {0: 'entity1'}} expected_df = pd.DataFrame.from_dict(expected_df_data) pd.testing.assert_frame_equal(df, expected_df) - def test_get_entities_file_none_not_container_nested(self): + def test_get_obj_entities_file_none_not_container_nested(self): er = ExternalResources() file = ExternalResourcesManagerContainer() child = Container(name='child') @@ -364,14 +398,13 @@ def test_get_entities_file_none_not_container_nested(self): df = er.get_object_entities(container=child) expected_df_data = \ - {'key_names': {0: 'key1'}, - 'entity_id': {0: 'entity_id1'}, + {'entity_id': {0: 'entity_id1'}, 'entity_uri': {0: 'entity1'}} expected_df = pd.DataFrame.from_dict(expected_df_data) pd.testing.assert_frame_equal(df, expected_df) - def test_get_entities_file_none_not_container_deep_nested(self): + def test_get_obj_entities_file_none_not_container_deep_nested(self): er = 
ExternalResources() file = ExternalResourcesManagerContainer() child = Container(name='child') @@ -387,14 +420,13 @@ def test_get_entities_file_none_not_container_deep_nested(self): df = er.get_object_entities(container=nested_child) expected_df_data = \ - {'key_names': {0: 'key1'}, - 'entity_id': {0: 'entity_id1'}, + {'entity_id': {0: 'entity_id1'}, 'entity_uri': {0: 'entity1'}} expected_df = pd.DataFrame.from_dict(expected_df_data) pd.testing.assert_frame_equal(df, expected_df) - def test_get_entities_file_none_error(self): + def test_get_obj_entities_file_none_error(self): er = ExternalResources() data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) file = ExternalResourcesManagerContainer(name='file') @@ -406,7 +438,7 @@ def test_get_entities_file_none_error(self): with self.assertRaises(ValueError): _ = er.get_object_entities(container=data) - def test_get_entities_attribute(self): + def test_get_obj_entities_attribute(self): table = DynamicTable(name='table', description='table') table.add_column(name='col1', description="column") table.add_row(id=0, col1='data') @@ -425,8 +457,7 @@ def test_get_entities_attribute(self): attribute='col1') expected_df_data = \ - {'key_names': {0: 'key1'}, - 'entity_id': {0: 'entity_0'}, + {'entity_id': {0: 'entity_0'}, 'entity_uri': {0: 'entity_0_uri'}} expected_df = pd.DataFrame.from_dict(expected_df_data) @@ -457,12 +488,61 @@ def test_to_and_from_norm_tsv_entity_value_error(self): entity_uri='entity1') er.to_norm_tsv(path='./') + self.child_tsv(external_resources=er) + df = er.entities.to_dataframe() df.at[0, ('keys_idx')] = 10 # Change key_ix 0 to 10 df.to_csv('./entities.tsv', sep='\t', index=False) - msg = "Key Index out of range in EntityTable. Please check for alterations." 
- with self.assertRaisesWith(ValueError, msg): + self.zip_child() + + with self.assertRaises(ValueError): + _ = ExternalResources.from_norm_tsv(path='./') + + self.remove_er_files() + + def test_to_and_from_norm_tsv_entity_key_value_error_key(self): + er = ExternalResources() + data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data, + key='key1', + entity_id='entity_id1', + entity_uri='entity1') + er.to_norm_tsv(path='./') + + self.child_tsv(external_resources=er) + + df = er.entity_keys.to_dataframe() + df.at[0, ('keys_idx')] = 10 # Change key_ix 0 to 10 + df.to_csv('./entity_keys.tsv', sep='\t', index=False) + + self.zip_child() + + with self.assertRaises(ValueError): + _ = ExternalResources.from_norm_tsv(path='./') + + self.remove_er_files() + + def test_to_and_from_norm_tsv_entity_key_value_error_entity(self): + er = ExternalResources() + data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data, + key='key1', + entity_id='entity_id1', + entity_uri='entity1') + er.to_norm_tsv(path='./') + + self.child_tsv(external_resources=er) + + df = er.entity_keys.to_dataframe() + df.at[0, ('entities_idx')] = 10 # Change key_ix 0 to 10 + df.to_csv('./entity_keys.tsv', sep='\t', index=False) + + self.zip_child() + + with self.assertRaises(ValueError): _ = ExternalResources.from_norm_tsv(path='./') self.remove_er_files() @@ -477,10 +557,14 @@ def test_to_and_from_norm_tsv_object_value_error(self): entity_uri='entity1') er.to_norm_tsv(path='./') + self.child_tsv(external_resources=er) + df = er.objects.to_dataframe() df.at[0, ('files_idx')] = 10 # Change key_ix 0 to 10 df.to_csv('./objects.tsv', sep='\t', index=False) + self.zip_child() + msg = "File_ID Index out of range in ObjectTable. Please check for alterations." 
with self.assertRaisesWith(ValueError, msg): _ = ExternalResources.from_norm_tsv(path='./') @@ -497,10 +581,14 @@ def test_to_and_from_norm_tsv_object_keys_object_idx_value_error(self): entity_uri='entity1') er.to_norm_tsv(path='./') + self.child_tsv(external_resources=er) + df = er.object_keys.to_dataframe() df.at[0, ('objects_idx')] = 10 # Change key_ix 0 to 10 df.to_csv('./object_keys.tsv', sep='\t', index=False) + self.zip_child() + msg = "Object Index out of range in ObjectKeyTable. Please check for alterations." with self.assertRaisesWith(ValueError, msg): _ = ExternalResources.from_norm_tsv(path='./') @@ -517,62 +605,20 @@ def test_to_and_from_norm_tsv_object_keys_key_idx_value_error(self): entity_uri='entity1') er.to_norm_tsv(path='./') + self.child_tsv(external_resources=er) + df = er.object_keys.to_dataframe() df.at[0, ('keys_idx')] = 10 # Change key_ix 0 to 10 df.to_csv('./object_keys.tsv', sep='\t', index=False) + self.zip_child() + msg = "Key Index out of range in ObjectKeyTable. Please check for alterations." 
with self.assertRaisesWith(ValueError, msg): _ = ExternalResources.from_norm_tsv(path='./') self.remove_er_files() - def test_to_flat_tsv_and_from_flat_tsv(self): - # write er to file - er = ExternalResources() - data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) - er.add_ref(file=ExternalResourcesManagerContainer(name='file'), - container=data, - key='key1', - entity_id='entity_id1', - entity_uri='entity1') - er.to_flat_tsv(path='./er.tsv') - # read er back from file and compare - er_obj = ExternalResources.from_flat_tsv(path='./er.tsv') - # Check that the data is correct - ExternalResources.assert_external_resources_equal(er_obj, er, check_dtype=False) - self.remove_er_files() - - def test_to_flat_tsv_and_from_flat_tsv_missing_keyidx(self): - # write er to file - df = self.container.to_dataframe(use_categories=True) - df.at[0, ('keys', 'keys_idx')] = 10 # Change key_ix 0 to 10 - df.to_csv(self.export_filename, sep='\t') - # read er back from file and compare - msg = "Missing keys_idx entries [0, 2, 3, 4, 5, 6, 7, 8, 9]" - with self.assertRaisesWith(ValueError, msg): - _ = ExternalResources.from_flat_tsv(path=self.export_filename) - - def test_to_flat_tsv_and_from_flat_tsv_missing_objectidx(self): - # write er to file - df = self.container.to_dataframe(use_categories=True) - df.at[0, ('objects', 'objects_idx')] = 10 # Change objects_idx 0 to 10 - df.to_csv(self.export_filename, sep='\t') - # read er back from file and compare - msg = "Missing objects_idx entries [0, 2, 3, 4, 5, 6, 7, 8, 9]" - with self.assertRaisesWith(ValueError, msg): - _ = ExternalResources.from_flat_tsv(path=self.export_filename) - - def test_to_flat_tsv_and_from_flat_tsv_missing_entitiesidx(self): - # write er to file - er_df = self.container.to_dataframe(use_categories=True) - er_df.at[0, ('entities', 'entities_idx')] = 10 # Change entities_idx 0 to 10 - er_df.to_csv(self.export_filename, sep='\t') - # read er back from file and compare - msg = "Missing entities_idx entries 
[0, 2, 3, 4, 5, 6, 7, 8, 9]" - with self.assertRaisesWith(ValueError, msg): - _ = ExternalResources.from_flat_tsv(path=self.export_filename) - def test_add_ref_two_keys(self): er = ExternalResources() ref_container_1 = Container(name='Container_1') @@ -589,7 +635,7 @@ def test_add_ref_two_keys(self): entity_uri='url21') self.assertEqual(er.keys.data, [('key1',), ('key2',)]) - self.assertEqual(er.entities.data, [(0, 'id11', 'url11'), (1, 'id12', 'url21')]) + self.assertEqual(er.entities.data, [('id11', 'url11'), ('id12', 'url21')]) self.assertEqual(er.objects.data, [(0, ref_container_1.object_id, 'Container', '', ''), (1, ref_container_2.object_id, 'Container', '', '')]) @@ -610,7 +656,7 @@ def test_add_ref_same_key_diff_objfield(self): entity_uri='url21') self.assertEqual(er.keys.data, [('key1',), ('key1',)]) - self.assertEqual(er.entities.data, [(0, 'id11', 'url11'), (1, 'id12', 'url21')]) + self.assertEqual(er.entities.data, [('id11', 'url11'), ('id12', 'url21')]) self.assertEqual(er.objects.data, [(0, ref_container_1.object_id, 'Container', '', ''), (1, ref_container_2.object_id, 'Container', '', '')]) @@ -637,9 +683,9 @@ def test_add_ref_same_keyname(self): self.assertEqual(er.keys.data, [('key1',), ('key1',), ('key1',)]) self.assertEqual( er.entities.data, - [(0, 'id11', 'url11'), - (1, 'id12', 'url21'), - (2, 'id13', 'url31')]) + [('id11', 'url11'), + ('id12', 'url21'), + ('id13', 'url31')]) self.assertEqual(er.objects.data, [(0, ref_container_1.object_id, 'Container', '', ''), (1, ref_container_2.object_id, 'Container', '', ''), (2, ref_container_3.object_id, 'Container', '', '')]) @@ -660,9 +706,163 @@ def test_object_key_unqiueness(self): key=existing_key, entity_id='entity2', entity_uri='entity_uri2') - self.assertEqual(er.object_keys.data, [(0, 0)]) + def test_object_key_existing_key_new_object(self): + er = ExternalResources() + data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 
'U14'), ('age', 'i4'), ('weight', 'f4')])) + + data_2 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_1, + key='Mus musculus', + entity_id='NCBI:txid10090', + entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') + existing_key = er.get_key('Mus musculus') + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_2, + key=existing_key, + entity_id='entity2', + entity_uri='entity_uri2') + self.assertEqual(er.object_keys.data, [(0, 0), (1, 0)]) + + def test_object_key_existing_key_new_object_error(self): + er = ExternalResources() + data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_1, + key='Mus musculus', + entity_id='NCBI:txid10090', + entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') + key = er._add_key('key') + with self.assertRaises(ValueError): + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_1, + key=key, + entity_id='entity1', + entity_uri='entity_uri1') + + def test_reuse_key_reuse_entity(self): + # With the key and entity existing, the EntityKeyTable should not have duplicates + er = ExternalResources() + data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + + data_2 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_1, + key='Mus musculus', + 
entity_id='NCBI:txid10090', + entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') + existing_key = er.get_key('Mus musculus') + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_2, + key=existing_key, + entity_id='NCBI:txid10090') + + self.assertEqual(er.entity_keys.data, [(0, 0)]) + + def test_resuse_entity_different_key(self): + # The EntityKeyTable should have two rows: same entity_idx, but different key_idx + er = ExternalResources() + data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + + data_2 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_1, + key='Mus musculus', + entity_id='NCBI:txid10090', + entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_2, + key='mouse', + entity_id='NCBI:txid10090') + self.assertEqual(er.entity_keys.data, [(0, 0), (0, 1)]) + + def test_reuse_key_reuse_entity_new(self): + er = ExternalResources() + data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + + data_2 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_1, + key='Mus musculus', + entity_id='NCBI:txid10090', + entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_1, + key='Mice', + 
entity_id='entity_2', + entity_uri='entity_2_uri') + existing_key = er.get_key('Mus musculus') + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_2, + key=existing_key, + entity_id='entity_2') + + self.assertEqual(er.entity_keys.data, [(0, 0), (1, 1), (1, 0)]) + + def test_entity_uri_error(self): + er = ExternalResources() + data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + with self.assertRaises(ValueError): + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_1, + key='Mus musculus', + entity_id='NCBI:txid10090') + + def test_entity_uri_reuse_error(self): + er = ExternalResources() + data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + + data_2 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_1, + key='Mus musculus', + entity_id='NCBI:txid10090', + entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') + existing_key = er.get_key('Mus musculus') + with self.assertRaises(ValueError): + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_2, + key=existing_key, + entity_id='NCBI:txid10090', + entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') + + def test_key_without_entity_error(self): + er = ExternalResources() + data_1 = Data(name='data_name', data=np.array([('Mus musculus', 9, 81.0), ('Homo sapien', 3, 27.0)], + dtype=[('species', 'U14'), ('age', 'i4'), ('weight', 'f4')])) + + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_1, + key='Mus musculus', + 
entity_id='NCBI:txid10090', + entity_uri='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090') + key = er._add_key('key') + with self.assertRaises(ValueError): + er.add_ref(file=ExternalResourcesManagerContainer(name='file'), + container=data_1, + key=key, + entity_id='entity1') + def test_check_object_field_add(self): er = ExternalResources() data = Data(name="species", data=['Homo sapiens', 'Mus musculus']) @@ -725,7 +925,7 @@ def test_add_ref_attribute(self): entity_uri='entity_0_uri') self.assertEqual(er.keys.data, [('key1',)]) - self.assertEqual(er.entities.data, [(0, 'entity_0', 'entity_0_uri')]) + self.assertEqual(er.entities.data, [('entity_0', 'entity_0_uri')]) self.assertEqual(er.objects.data, [(0, table.id.object_id, 'ElementIdentifiers', '', '')]) def test_add_ref_column_as_attribute(self): @@ -744,7 +944,7 @@ def test_add_ref_column_as_attribute(self): entity_uri='entity_0_uri') self.assertEqual(er.keys.data, [('key1',)]) - self.assertEqual(er.entities.data, [(0, 'entity_0', 'entity_0_uri')]) + self.assertEqual(er.entities.data, [('entity_0', 'entity_0_uri')]) self.assertEqual(er.objects.data, [(0, table['col1'].object_id, 'VectorData', '', '')]) def test_add_ref_compound_data(self): @@ -763,7 +963,7 @@ def test_add_ref_compound_data(self): entity_uri='entity_0_uri') self.assertEqual(er.keys.data, [('Mus musculus',)]) - self.assertEqual(er.entities.data, [(0, 'NCBI:txid10090', 'entity_0_uri')]) + self.assertEqual(er.entities.data, [('NCBI:txid10090', 'entity_0_uri')]) self.assertEqual(er.objects.data, [(0, data.object_id, 'Data', '', 'species')]) def test_roundtrip(self): @@ -817,7 +1017,7 @@ def test_add_ref_nested(self): entity_id='entity_0', entity_uri='entity_0_uri') self.assertEqual(er.keys.data, [('key1',)]) - self.assertEqual(er.entities.data, [(0, 'entity_0', 'entity_0_uri')]) + self.assertEqual(er.entities.data, [('entity_0', 'entity_0_uri')]) self.assertEqual(er.objects.data, [(0, table.object_id, 'DynamicTable', 'description', 
'')]) def test_add_ref_deep_nested(self):