TermSet Integration #862

Closed
wants to merge 43 commits

Changes from all commits (43 commits)
1c3344b  first (mavaylon1, May 12, 2023)
e291adc  tests (mavaylon1, May 12, 2023)
e2f36ff  linkml (mavaylon1, May 12, 2023)
259c16e  path (mavaylon1, May 12, 2023)
adfbf01  validation tests (mavaylon1, May 12, 2023)
b707c74  validation for DT (mavaylon1, May 12, 2023)
2dbfdf9  tests df validate (mavaylon1, May 13, 2023)
5665f7e  er tests for termset (mavaylon1, May 15, 2023)
da986e8  yaml duplicate for testing (mavaylon1, May 15, 2023)
555d5fc  typo (mavaylon1, May 15, 2023)
527dd8d  req (mavaylon1, May 15, 2023)
3df45a0  linkml (mavaylon1, May 15, 2023)
0cbc32f  paths (mavaylon1, May 15, 2023)
8cbab52  tutorials (mavaylon1, May 15, 2023)
ae50c9c  update (mavaylon1, May 15, 2023)
590cbda  Merge branch 'dev' into termset (rly, May 16, 2023)
f0fc10e  path (mavaylon1, May 17, 2023)
154234c  path (mavaylon1, May 17, 2023)
b8eca63  path (mavaylon1, May 17, 2023)
e841d6e  Update plot_term_set.py (mavaylon1, May 17, 2023)
353d0ed  Update plot_external_resources.py (mavaylon1, May 17, 2023)
426cf41  path gallery (mavaylon1, May 17, 2023)
d1668be  Update requirements-min.txt (mavaylon1, May 17, 2023)
fc54686  Update pyproject.toml (mavaylon1, May 17, 2023)
15bcfdb  Update requirements-min.txt (mavaylon1, May 17, 2023)
09ebc8c  Update pyproject.toml (mavaylon1, May 17, 2023)
f5ec77a  Update requirements-min.txt (mavaylon1, May 17, 2023)
55dda2a  Update requirements-dev.txt (mavaylon1, May 17, 2023)
e6d2358  Update requirements-min.txt (mavaylon1, May 17, 2023)
0dff59c  Update requirements.txt (mavaylon1, May 17, 2023)
26b8d8c  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 17, 2023)
51e6a37  Update pyproject.toml (mavaylon1, May 17, 2023)
c3d500b  Update requirements-dev.txt (mavaylon1, May 17, 2023)
8813e0b  Update tox.ini (mavaylon1, May 17, 2023)
ec42383  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 17, 2023)
d1e1919  Update tox.ini (mavaylon1, May 17, 2023)
d5de41b  Update environment-ros3.yml (mavaylon1, May 17, 2023)
b5d6f8a  Update environment-ros3.yml (mavaylon1, May 17, 2023)
6d6eb3a  Update tests/unit/test_term_set.py (mavaylon1, May 22, 2023)
ed273e3  Update tests/unit/test_container.py (mavaylon1, May 22, 2023)
22ffdc0  unit test skips (mavaylon1, May 25, 2023)
a088aad  undo changes to install (mavaylon1, May 25, 2023)
ef43b23  updates (mavaylon1, May 25, 2023)
24 changes: 24 additions & 0 deletions docs/gallery/example_term_set.yaml
@@ -0,0 +1,24 @@
id: pynert/termset/species_example
name: Species
prefixes:
NCBI_TAXON: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=
Ensemble: https://rest.ensembl.org/taxonomy/id/
imports:
- linkml:types
default_range: string

enums:
Species:
permissible_values:
Homo sapiens:
description: description
meaning: NCBI_TAXON:9606
Mus musculus:
description: description
meaning: Ensemble:10090
Ursus arctos horribilis:
description: description
meaning: NCBI_TAXON:116960
Myrmecophaga tridactyla:
description: description
meaning: NCBI_TAXON:71006
30 changes: 30 additions & 0 deletions docs/gallery/plot_external_resources.py
@@ -320,3 +320,33 @@ def __init__(self, **kwargs):

er_read = ExternalResources.from_flat_tsv(path='./er_example.tsv')
remove_test_file('./er_example.tsv')

###############################################################################
# Using TermSet with ExternalResources
# ------------------------------------------------
# :py:class:`~hdmf.TermSet` provides an easier way to add references to
# :py:class:`~hdmf.common.resources.ExternalResources`. The enumerated terms take the place of the
# entity_id and entity_uri parameters. :py:class:`~hdmf.common.resources.Key` values must match the
# name of the term in the :py:class:`~hdmf.TermSet`.
try:
    import linkml_runtime  # noqa: F401
    LINKML_INSTALLED = True
except ImportError:
    LINKML_INSTALLED = False

if LINKML_INSTALLED:
    from hdmf.term_set import TermSet

    terms = TermSet(name='Species', term_schema_path='docs/gallery/example_term_set.yaml')
    col1 = VectorData(
        name='Species_Data',
        description='...',
        data=['Homo sapiens', 'Ursus arctos horribilis'],
        term_set=terms,
    )

    species = DynamicTable(name='species', description='My species', columns=[col1])
    er.add_ref_term_set(file=file,
                        container=species,
                        attribute='Species_Data',
                        )
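
###############################################################################
# A note on the return value: based on this PR's implementation of
# ``add_ref_term_set`` in ``src/hdmf/common/resources.py`` (shown further below),
# the method returns ``True`` when every value resolves against the TermSet, and
# a dict of the unresolved terms otherwise. A minimal, hypothetical sketch of
# checking that result (the extra call here is for illustration only):
if LINKML_INSTALLED:
    result = er.add_ref_term_set(file=file, container=species, attribute='Species_Data')
    if result is not True:
        print(result)  # e.g., {'Missing Values in TermSet': ['some term']}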
148 changes: 148 additions & 0 deletions docs/gallery/plot_term_set.py
@@ -0,0 +1,148 @@
"""
TermSet
=======

This is a user guide for interacting with the
:py:class:`~hdmf.TermSet` class. The TermSet type
is experimental and is subject to change in future releases. If you use this type,
please provide feedback to the HDMF team so that we can improve the structure and
overall capabilities.

Introduction
-------------
The :py:class:`~hdmf.TermSet` class provides a way for users to create their own
set of terms from brain atlases, species taxonomies, and anatomical, cell, and
gene function ontologies.

:py:class:`~hdmf.TermSet` serves two purposes: data validation and external reference
management. Users will be able to validate their data against their own set of terms, ensuring
clean data that can later be used in line with the FAIR principles.
The :py:class:`~hdmf.TermSet` class allows for a reusable and shareable
pool of metadata to serve as references to any dataset within the NWB ecosystem.
The :py:class:`~hdmf.TermSet` class is used closely with
:py:class:`~hdmf.common.resources.ExternalResources` to more efficiently map terms
to data. Please refer to the tutorial on ExternalResources to see how :py:class:`~hdmf.TermSet`
is used with :py:class:`~hdmf.common.resources.ExternalResources`.

:py:class:`~hdmf.TermSet` is built upon LinkML, a modeling
language for creating YAML schemas, giving :py:class:`~hdmf.TermSet`
a standardized structure and a variety of tools to help the user manage their references.

How to make a TermSet Schema
----------------------------
Before the user can take advantage of all the wonders within the
:py:class:`~hdmf.TermSet` class, the user needs a LinkML schema (YAML) that provides
all the permissible term values. Please refer to https://linkml.io/linkml/intro/tutorial06.html
to learn more about how LinkML structures its schemas.

1. The name of the schema is up to the user, e.g., the name could be "Species" if the term set will
contain species terms.
2. The prefixes will be the standardized prefix of your source, followed by the URI to the terms.
For example, the NCBI Taxonomy is abbreviated as NCBI_TAXON, and Ensembl is simply Ensemble.
As mentioned above, the URI needs to point to the terms; this allows the URI to later be coupled
with the source id for the term to create a valid link to the term's source page. In the case of
Ensembl, it would be "https://rest.ensembl.org/taxonomy/id/".
3. The schema uses LinkML enumerations to list all the possible terms. Currently, users will need to
manually outline the terms within the enumeration's permissible values.

For a clear example, please refer to example_term_set.yaml within the tutorial gallery.
"""
######################################################
# Creating an instance of the TermSet class
# ----------------------------------------------------
from hdmf.common import ExternalResources, DynamicTable, VectorData
try:
    import linkml_runtime  # noqa: F401
    LINKML_INSTALLED = True
except ImportError:
    LINKML_INSTALLED = False

if LINKML_INSTALLED:
    from hdmf.term_set import TermSet

######################################################
# Viewing TermSet values
# ----------------------------------------------------
# :py:class:`~hdmf.TermSet` has methods to retrieve terms. The :py:attr:`~hdmf.TermSet.view_set`
# property will return a dictionary of all the terms and the corresponding information for each term.
# Users can index specific terms from the :py:class:`~hdmf.TermSet`.
if LINKML_INSTALLED:
    terms = TermSet(name='Species', term_schema_path='docs/gallery/example_term_set.yaml')
    terms.view_set

    # Retrieve a specific term
    terms['Homo sapiens']
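    # Indexing returns a tuple of term information. A sketch of the convention,
    # inferred from how ``add_ref_term_set`` consumes this tuple later in this PR
    # (src/hdmf/common/resources.py): element 0 is the entity id
    # (e.g., 'NCBI_TAXON:9606') and element 2 is the entity URI.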

######################################################
# Validate Data with TermSet
# ----------------------------------------------------
# :py:class:`~hdmf.TermSet` has been integrated so that :py:class:`~hdmf.Data` and its
# subclasses support a term_set attribute. When this attribute is set, the initial data is
# validated, and all data added later is validated as well.
if LINKML_INSTALLED:
    data = VectorData(
        name='species',
        description='...',
        data=['Homo sapiens'],
        term_set=terms)
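
    # A hedged sketch of the failure mode (the exact exception type and message
    # are implementation details of this PR): passing a value that is not in the
    # TermSet is expected to raise an error, e.g.:
    #
    #     VectorData(name='species', description='...',
    #                data=['Homo Sapiens'],  # capital "S": not a permissible value
    #                term_set=terms)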

######################################################
# Validate on append with TermSet
# ----------------------------------------------------
# As mentioned above, when the term_set attribute is set, all new data is validated. This is true
# for both the append and extend methods.
if LINKML_INSTALLED:
    data.append('Ursus arctos horribilis')
    data.extend(['Mus musculus', 'Myrmecophaga tridactyla'])

######################################################
# Validate Data in a DynamicTable with TermSet
# ----------------------------------------------------
# Whether data in a :py:class:`~hdmf.common.table.DynamicTable` is validated is determined by which
# columns were initialized with the term_set attribute. The data is validated when each column is
# created, not when the columns are added to the table.
if LINKML_INSTALLED:
    col1 = VectorData(
        name='Species_1',
        description='...',
        data=['Homo sapiens'],
        term_set=terms,
    )
    col2 = VectorData(
        name='Species_2',
        description='...',
        data=['Mus musculus'],
        term_set=terms,
    )
    species = DynamicTable(name='species', description='My species', columns=[col1, col2])

######################################################
# Validate new rows in a DynamicTable with TermSet
# ----------------------------------------------------
# Validating new rows in a :py:class:`~hdmf.common.table.DynamicTable` is simple. The
# :py:func:`~hdmf.common.table.DynamicTable.add_row` method will automatically check each column for a
# :py:class:`~hdmf.TermSet` (via the term_set attribute). If the attribute is set, the data will be
# validated for that column using that column's :py:class:`~hdmf.TermSet`. If there is invalid data,
# the row will not be added, and the user will be prompted to fix the new data in order to populate
# the table.
if LINKML_INSTALLED:
    species.add_row(Species_1='Mus musculus', Species_2='Mus musculus')
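
    # A hedged sketch: a row containing a value outside the TermSet should be
    # rejected, leaving the table unchanged, e.g.:
    #
    #     species.add_row(Species_1='Mus musculus', Species_2='Tyrannosaurus rex')
    #     # expected to raise an error: 'Tyrannosaurus rex' is not in the TermSet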

######################################################
# Validate new columns in a DynamicTable with TermSet
# ----------------------------------------------------
# As mentioned above, validation in a :py:class:`~hdmf.common.table.DynamicTable` is determined
# by the columns. The :py:func:`~hdmf.common.table.DynamicTable.add_column` method accepts a term_set
# argument, just as if you were making a new instance of :py:class:`~hdmf.common.table.VectorData`.
# When set, this argument will be used to validate the data. The column will not be added if there
# is invalid data.
if LINKML_INSTALLED:
    col1 = VectorData(
        name='Species_1',
        description='...',
        data=['Homo sapiens'],
        term_set=terms,
    )
    species = DynamicTable(name='species', description='My species', columns=[col1])
    species.add_column(name='Species_2',
                       description='Species data',
                       data=['Mus musculus'],
                       term_set=terms)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
"ruamel.yaml>=0.16",
"scipy>=1.1",
"importlib-metadata<4.3; python_version < '3.8'", # TODO: remove when minimum python version is 3.8
"importlib-resources; python_version < '3.9'", # TODO: remove when minimum python version is 3.9
"importlib-resources; python_version < '3.9'", # TODO: remove when minimum python version is 3.9
]
dynamic = ["version"]

2 changes: 1 addition & 1 deletion requirements-min.txt
@@ -2,7 +2,7 @@
h5py==2.10 # support for selection of datasets with list of indices added in 2.10
importlib-metadata==4.2.0; python_version < "3.8" # TODO: remove when minimum python version is 3.8
importlib-resources==5.12.0; python_version < "3.9"  # TODO: remove when minimum python version is 3.9
jsonschema==2.6.0
jsonschema>=2.6.0
Contributor review comment: The point of this file is to be used when testing whether the minimum requirements set in pyproject.toml are valid. Using >= instead of == defeats the point.

Contributor review comment: If 2.6.0 is too small, then the minimum version should be increased both here and in pyproject.toml.

numpy==1.16 # numpy>=1.16,<1.18 does not provide wheels for python 3.8 and does not build well on windows
pandas==1.0.5 # when this is changed to >=1.5.0, see TODO items referenced in #762
ruamel.yaml==0.16
1 change: 1 addition & 0 deletions src/hdmf/__init__.py
@@ -3,6 +3,7 @@
from .container import Container, Data, DataRegion, ExternalResourcesManager
from .region import ListSlicer
from .utils import docval, getargs
from .term_set import TermSet


@docval(
123 changes: 96 additions & 27 deletions src/hdmf/common/resources.py
@@ -2,7 +2,8 @@
import numpy as np
from . import register_class, EXP_NAMESPACE
from . import get_type_map
from ..container import Table, Row, Container, AbstractContainer, ExternalResourcesManager
from ..container import Table, Row, Container, AbstractContainer, Data, ExternalResourcesManager
from ..data_utils import DataIO
from ..utils import docval, popargs, AllowPositional
from ..build import TypeMap
from glob import glob
@@ -350,31 +351,6 @@ def _check_object_field(self, **kwargs):
            raise ValueError("Found multiple instances of the same object id, relative path, "
                             "and field in objects table.")

    @docval({'name': 'container', 'type': (str, AbstractContainer),
             'doc': ('The Container/Data object that uses the key or '
                     'the object id for the Container/Data object that uses the key.')})
    def _get_file_from_container(self, **kwargs):
        """
        Method to retrieve a file associated with the container in the case a file is not provided.
        """
        container = kwargs['container']

        if isinstance(container, ExternalResourcesManager):
            file = container
            return file
        else:
            parent = container.parent
            if parent is not None:
                while parent is not None:
                    if isinstance(parent, ExternalResourcesManager):
                        file = parent
                        return file
                    else:
                        parent = parent.parent
            else:
                msg = 'Could not find file. Add container to the file.'
                raise ValueError(msg)

    @docval({'name': 'key_name', 'type': str, 'doc': 'The name of the Key to get.'},
            {'name': 'file', 'type': ExternalResourcesManager, 'doc': 'The file associated with the container.',
             'default': None},
@@ -518,7 +494,100 @@ def add_ref(self, **kwargs):

        entity = self._add_entity(key, entity_id, entity_uri)

        return key, entity
        return True

    @docval({'name': 'container', 'type': (str, AbstractContainer),
             'doc': ('The Container/Data object that uses the key or '
                     'the object id for the Container/Data object that uses the key.')})
    def _get_file_from_container(self, **kwargs):
        """
        Method to retrieve a file associated with the container in the case a file is not provided.
        """
        container = kwargs['container']

        if isinstance(container, ExternalResourcesManager):
            file = container
            return file
        else:
            parent = container.parent
            if parent is not None:
                while parent is not None:
                    if isinstance(parent, ExternalResourcesManager):
                        file = parent
                        return file
                    else:
                        parent = parent.parent
            else:
                msg = 'Could not find file. Add container to the file.'
                raise ValueError(msg)

    @docval({'name': 'file', 'type': ExternalResourcesManager, 'doc': 'The file associated with the container.',
             'default': None},
            {'name': 'container', 'type': (str, AbstractContainer), 'default': None,
             'doc': ('The Container/Data object that uses the key or '
                     'the object_id for the Container/Data object that uses the key.')},
            {'name': 'attribute', 'type': str,
             'doc': 'The attribute of the container for the external reference.', 'default': None},
            {'name': 'field', 'type': str, 'default': '',
             'doc': ('The field of the compound data type using an external resource.')},
            {'name': 'key', 'type': (str, Key), 'default': None,
             'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'},
            )
    def add_ref_term_set(self, **kwargs):
        file = kwargs['file']
        container = kwargs['container']
        attribute = kwargs['attribute']
        key = kwargs['key']
        field = kwargs['field']

        # Resolve the TermSet from the container itself or from the named attribute.
        if attribute is None:
            try:
                term_set = container.term_set
            except AttributeError:
                msg = "Cannot Find TermSet"
                raise AttributeError(msg)
        else:
            term_set = container[attribute].term_set
            if term_set is None:
                msg = "Cannot Find TermSet"
                raise ValueError(msg)

        if file is None:
            file = self._get_file_from_container(container=container)

        # If a key is provided, then add_ref proceeds as normal:
        # use the key provided as the term in the term_set for entity look-up.
        if key is not None:
            data = [key]
        else:
            if attribute is None:
                data_object = container
            else:
                data_object = container[attribute]
            if isinstance(data_object, (Data, DataIO)):
                data = data_object.data
            elif isinstance(data_object, (list, np.ndarray)):
                data = data_object
        # Look up each term; terms missing from the TermSet are collected and skipped.
        missing_terms = []
        for term in data:
            try:
                term_info = term_set[term]
            except ValueError:
                missing_terms.append(term)
                continue
            # term_info elements: 0 is the entity id, 2 is the entity URI.
            entity_id = term_info[0]
            entity_uri = term_info[2]
            self.add_ref(file=file,
                         container=container,
                         attribute=attribute,
                         key=term,
                         field=field,
                         entity_id=entity_id,
                         entity_uri=entity_uri)
        if len(missing_terms) > 0:
            return {"Missing Values in TermSet": missing_terms}
        else:
            return True

    @docval({'name': 'object_type', 'type': str,
             'doc': 'The type of the object. This is also the parent in relative_path.'},