
Commit

Merge branch 'dev' into dependabot/github_actions/actions/add-to-project-1.0.2
rly authored Oct 2, 2024
2 parents d053c7f + 335abdd commit d6133de
Showing 30 changed files with 1,261 additions and 89 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/run_all_tests.yml
@@ -165,13 +165,12 @@ jobs:
auto-update-conda: true
python-version: ${{ matrix.python-ver }}
channels: conda-forge
-mamba-version: "*"

- name: Install build dependencies
run: |
conda config --set always_yes yes --set changeps1 no
conda info
-mamba install -c conda-forge "tox>=4"
+conda install -c conda-forge "tox>=4"
- name: Conda reporting
run: |
@@ -229,7 +228,6 @@ jobs:
python-version: ${{ matrix.python-ver }}
channels: conda-forge
auto-activate-base: false
-mamba-version: "*"

- name: Install run dependencies
run: |
1 change: 0 additions & 1 deletion .github/workflows/run_coverage.yml
@@ -101,7 +101,6 @@ jobs:
python-version: ${{ matrix.python-ver }}
channels: conda-forge
auto-activate-base: false
-mamba-version: "*"

- name: Install run dependencies
run: |
4 changes: 1 addition & 3 deletions .github/workflows/run_tests.yml
@@ -139,13 +139,12 @@ jobs:
auto-update-conda: true
python-version: ${{ matrix.python-ver }}
channels: conda-forge
-mamba-version: "*"

- name: Install build dependencies
run: |
conda config --set always_yes yes --set changeps1 no
conda info
-mamba install -c conda-forge "tox>=4"
+conda install -c conda-forge "tox>=4"
- name: Conda reporting
run: |
@@ -239,7 +238,6 @@ jobs:
python-version: ${{ matrix.python-ver }}
channels: conda-forge
auto-activate-base: false
-mamba-version: "*"

- name: Install run dependencies
run: |
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -18,7 +18,7 @@ repos:
# hooks:
# - id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
-rev: v0.4.9
+rev: v0.6.8
hooks:
- id: ruff
# - repo: https://github.com/econchick/interrogate
41 changes: 40 additions & 1 deletion CHANGELOG.md
@@ -1,10 +1,49 @@
# HDMF Changelog

-## HDMF 3.14.2 (Upcoming)
+## HDMF 3.14.6 (Upcoming)

### Bug fixes
- Fixed mamba-related error in conda-based GitHub Actions. @rly [#1194](https://github.com/hdmf-dev/hdmf/pull/1194)

## HDMF 3.14.5 (September 17, 2024)

### Enhancements
- Added support for overriding backend configurations of `h5py.Dataset` objects in `Container.set_data_io`. @pauladkisson [#1172](https://github.com/hdmf-dev/hdmf/pull/1172)

### Bug fixes
- Fixed a bug, introduced in 3.14.4, in writing string arrays to an HDF5 file when they were read from an HDF5 file. @rly @stephprince
[#1189](https://github.com/hdmf-dev/hdmf/pull/1189)

## HDMF 3.14.4 (September 4, 2024)

### Enhancements
- Added support to append to a dataset of references for HDMF-Zarr. @mavaylon1 [#1157](https://github.com/hdmf-dev/hdmf/pull/1157)
- Adjusted stacklevel of warnings to point to user code when possible. @rly [#1166](https://github.com/hdmf-dev/hdmf/pull/1166)
- Improved "already exists" error message when adding a container to a `MultiContainerInterface`. @rly [#1165](https://github.com/hdmf-dev/hdmf/pull/1165)
- Added support to write multidimensional string arrays. @stephprince [#1173](https://github.com/hdmf-dev/hdmf/pull/1173)
- Add support for appending to a dataset of references. @mavaylon1 [#1135](https://github.com/hdmf-dev/hdmf/pull/1135)

### Bug fixes
- Fixed issue where scalar datasets with a compound data type were being written as non-scalar datasets. @stephprince [#1176](https://github.com/hdmf-dev/hdmf/pull/1176)
- Fixed H5DataIO not exposing `maxshape` on datasets that are not DataChunkIterators. @cboulay [#1149](https://github.com/hdmf-dev/hdmf/pull/1149)
- Fixed generation of classes in an extension that contain attributes or datasets storing references to other types defined in the extension.
@rly [#1183](https://github.com/hdmf-dev/hdmf/pull/1183)

## HDMF 3.14.3 (July 29, 2024)

### Enhancements
- Added new attribute "dimension_labels" on `DatasetBuilder` which specifies the names of the dimensions used in the
dataset based on the shape of the dataset data and the dimension names in the spec for the data type. This attribute
is available on build (during the write process), but not on read of a dataset from a file. @rly [#1081](https://github.com/hdmf-dev/hdmf/pull/1081)
- Speed up loading namespaces by skipping register_type when already registered. @magland [#1102](https://github.com/hdmf-dev/hdmf/pull/1102)
- Speed up namespace loading: return a shallow copy rather than a deep copy in build_const_args. @magland [#1103](https://github.com/hdmf-dev/hdmf/pull/1103)

## HDMF 3.14.2 (July 7, 2024)

### Enhancements
- Warn when unexpected keys are present in specs. @rly [#1134](https://github.com/hdmf-dev/hdmf/pull/1134)
- Support appending to zarr arrays. @mavaylon1 [#1136](https://github.com/hdmf-dev/hdmf/pull/1136)
- Support specifying "value" key in DatasetSpec. @rly [#1143](https://github.com/hdmf-dev/hdmf/pull/1143)
- Add support for numpy 2. @rly [#1139](https://github.com/hdmf-dev/hdmf/pull/1139)

### Bug fixes
2 changes: 1 addition & 1 deletion docs/source/install_developers.rst
@@ -73,7 +73,7 @@ environment by using the ``conda remove --name hdmf-venv --all`` command.
For advanced users, we recommend using Mambaforge_, a faster version of the conda package manager
that includes conda-forge as a default channel.

-.. _Anaconda: https://www.anaconda.com/products/distribution
+.. _Anaconda: https://www.anaconda.com/download
.. _Mambaforge: https://github.com/conda-forge/miniforge

Install from GitHub
2 changes: 1 addition & 1 deletion docs/source/install_users.rst
@@ -29,4 +29,4 @@ You can also install HDMF using ``conda`` by running the following command in a
conda install -c conda-forge hdmf
-.. _Anaconda Distribution: https://www.anaconda.com/products/distribution
+.. _Anaconda Distribution: https://www.anaconda.com/download
58 changes: 52 additions & 6 deletions src/hdmf/backends/hdf5/h5_utils.py
@@ -17,11 +17,11 @@
import logging

from ...array import Array
-from ...data_utils import DataIO, AbstractDataChunkIterator
+from ...data_utils import DataIO, AbstractDataChunkIterator, append_data
from ...query import HDMFDataset, ReferenceResolver, ContainerResolver, BuilderResolver
from ...region import RegionSlicer
from ...spec import SpecWriter, SpecReader
-from ...utils import docval, getargs, popargs, get_docval
+from ...utils import docval, getargs, popargs, get_docval, get_data_shape


class HDF5IODataChunkIteratorQueue(deque):
@@ -108,6 +108,20 @@ def ref(self):
def shape(self):
return self.dataset.shape

def append(self, arg):
# Get Builder
builder = self.io.manager.get_builder(arg)
if builder is None:
raise ValueError(
"The container being appended to the dataset has not yet been built. "
"Please write the container to the file, then open the modified file, and "
"append the read container to the dataset."
)

# Get HDF5 Reference
ref = self.io._create_ref(builder)
append_data(self.dataset, ref)


class DatasetOfReferences(H5Dataset, ReferenceResolver, metaclass=ABCMeta):
"""
@@ -501,7 +515,7 @@ def __init__(self, **kwargs):
# Check for possible collision with other parameters
if not isinstance(getargs('data', kwargs), Dataset) and self.__link_data:
self.__link_data = False
-warnings.warn('link_data parameter in H5DataIO will be ignored', stacklevel=2)
+warnings.warn('link_data parameter in H5DataIO will be ignored', stacklevel=3)
# Call the super constructor and consume the data parameter
super().__init__(**kwargs)
# Construct the dict with the io args, ignoring all options that were set to None
@@ -525,7 +539,7 @@ def __init__(self, **kwargs):
self.__iosettings.pop('compression', None)
if 'compression_opts' in self.__iosettings:
warnings.warn('Compression disabled by compression=False setting. ' +
-'compression_opts parameter will, therefore, be ignored.', stacklevel=2)
+'compression_opts parameter will, therefore, be ignored.', stacklevel=3)
self.__iosettings.pop('compression_opts', None)
# Validate the compression options used
self._check_compression_options()
@@ -540,16 +554,37 @@ def __init__(self, **kwargs):
if isinstance(self.data, Dataset):
for k in self.__iosettings.keys():
warnings.warn("%s in H5DataIO will be ignored with H5DataIO.data being an HDF5 dataset" % k,
-stacklevel=2)
+stacklevel=3)

self.__dataset = None

@property
def dataset(self):
"""Get the cached h5py.Dataset."""
return self.__dataset

@dataset.setter
def dataset(self, val):
"""Cache the h5py.Dataset written with the stored IO settings.
This attribute can be used to cache a written, empty dataset and fill it in later.
This allows users to access the handle to the dataset *without* having to close
and reopen a file.
For example::
dataio = H5DataIO(shape=(5,), dtype=int)
foo = Foo('foo1', dataio, "I am foo1", 17, 3.14)
bucket = FooBucket('bucket1', [foo])
foofile = FooFile(buckets=[bucket])
io = HDF5IO(self.path, manager=self.manager, mode='w')
# write the object to disk, including initializing an empty int dataset with shape (5,)
io.write(foofile)
foo.my_data.dataset[:] = [0, 1, 2, 3, 4]
io.close()
"""
if self.__dataset is not None:
raise ValueError("Cannot overwrite H5DataIO.dataset")
self.__dataset = val
@@ -597,7 +632,7 @@ def _check_compression_options(self):
if self.__iosettings['compression'] not in ['gzip', h5py_filters.h5z.FILTER_DEFLATE]:
warnings.warn(str(self.__iosettings['compression']) + " compression may not be available "
"on all installations of HDF5. Use of gzip is recommended to ensure portability of "
"the generated HDF5 files.", stacklevel=3)
"the generated HDF5 files.", stacklevel=4)

@staticmethod
def filter_available(filter, allow_plugin_filters):
@@ -637,3 +672,14 @@ def valid(self):
if isinstance(self.data, Dataset) and not self.data.id.valid:
return False
return super().valid

@property
def maxshape(self):
if 'maxshape' in self.io_settings:
return self.io_settings['maxshape']
elif hasattr(self.data, 'maxshape'):
return self.data.maxshape
elif hasattr(self, "shape"):
return self.shape
else:
return get_data_shape(self.data)
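
For context, a brief usage sketch of the new `maxshape` property that is not part of the commit; the import path and the wrapped numpy data are assumptions for illustration:

```python
# Hypothetical usage sketch (not from this commit): querying the new
# H5DataIO.maxshape property added above. It falls back from an explicit
# io setting to the wrapped data's own maxshape/shape.
import numpy as np
from hdmf.backends.hdf5 import H5DataIO

# maxshape passed explicitly -> returned from the stored io settings
expandable = H5DataIO(data=np.arange(10), maxshape=(None,))
print(expandable.maxshape)  # expected: (None,)

# no explicit maxshape -> falls back to the shape of the wrapped data
fixed = H5DataIO(data=np.arange(10))
print(fixed.maxshape)       # expected: (10,)
```
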
17 changes: 15 additions & 2 deletions src/hdmf/backends/hdf5/h5tools.py
@@ -344,7 +344,7 @@ def copy_file(self, **kwargs):
warnings.warn("The copy_file class method is no longer supported and may be removed in a future version of "
"HDMF. Please use the export method or h5py.File.copy method instead.",
category=DeprecationWarning,
-stacklevel=2)
+stacklevel=3)

source_filename, dest_filename, expand_external, expand_refs, expand_soft = getargs('source_filename',
'dest_filename',
@@ -698,6 +698,8 @@ def __read_dataset(self, h5obj, name=None):
d = ReferenceBuilder(target_builder)
kwargs['data'] = d
kwargs['dtype'] = d.dtype
elif h5obj.dtype.kind == 'V': # scalar compound data type
kwargs['data'] = np.array(scalar, dtype=h5obj.dtype)
else:
kwargs["data"] = scalar
else:
@@ -1227,6 +1229,8 @@ def _filler():

return
# If the compound data type contains only regular data (i.e., no references) then we can write it as usual
elif len(np.shape(data)) == 0:
dset = self.__scalar_fill__(parent, name, data, options)
else:
dset = self.__list_fill__(parent, name, data, options)
# Write a dataset containing references, i.e., a region or object reference.
@@ -1469,7 +1473,7 @@ def __list_fill__(cls, parent, name, data, options=None):
data_shape = io_settings.pop('shape')
elif hasattr(data, 'shape'):
data_shape = data.shape
-elif isinstance(dtype, np.dtype):
+elif isinstance(dtype, np.dtype) and len(dtype) > 1:  # check if compound dtype
data_shape = (len(data),)
else:
data_shape = get_data_shape(data)
@@ -1514,6 +1518,7 @@ def __get_ref(self, **kwargs):
self.logger.debug("Getting reference for %s '%s'" % (container.__class__.__name__, container.name))
builder = self.manager.build(container)
path = self.__get_path(builder)

self.logger.debug("Getting reference at path '%s'" % path)
if isinstance(container, RegionBuilder):
region = container.region
@@ -1525,6 +1530,14 @@ def __get_ref(self, **kwargs):
else:
return self.__file[path].ref

@docval({'name': 'container', 'type': (Builder, Container, ReferenceBuilder), 'doc': 'the object to reference',
'default': None},
{'name': 'region', 'type': (slice, list, tuple), 'doc': 'the region reference indexing object',
'default': None},
returns='the reference', rtype=Reference)
def _create_ref(self, **kwargs):
return self.__get_ref(**kwargs)

def __is_ref(self, dtype):
if isinstance(dtype, DtypeSpec):
return self.__is_ref(dtype.dtype)
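
For context, a small standalone illustration, not taken from the PR, of the scalar compound case the new branches handle; it uses only numpy and h5py, and the file and dataset names are made up:

```python
# Standalone illustration (not from this commit) of the scalar compound case the
# new branches handle: a 0-d value with a structured/compound dtype (kind == 'V').
import numpy as np
import h5py

dt = np.dtype([('id', 'i4'), ('value', 'f8')])
scalar = np.array((1, 3.5), dtype=dt)    # 0-d structured array
print(np.shape(scalar))                  # () -> such data is now routed to __scalar_fill__ on write

with h5py.File('compound_scalar.h5', 'w') as f:
    f.create_dataset('record', data=scalar)

with h5py.File('compound_scalar.h5', 'r') as f:
    dset = f['record']
    # a scalar dataset with compound dtype: shape () and dtype.kind 'V',
    # which __read_dataset now reads back as np.array(scalar, dtype=h5obj.dtype)
    print(dset.shape, dset.dtype.kind)   # () V
```
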
16 changes: 14 additions & 2 deletions src/hdmf/build/builders.py
@@ -330,18 +330,25 @@ class DatasetBuilder(BaseBuilder):
'doc': 'The datatype of this dataset.', 'default': None},
{'name': 'attributes', 'type': dict,
'doc': 'A dictionary of attributes to create in this dataset.', 'default': dict()},
{'name': 'dimension_labels', 'type': tuple,
'doc': ('A list of labels for each dimension of this dataset from the spec. Currently this is '
'supplied only on build.'),
'default': None},
{'name': 'maxshape', 'type': (int, tuple),
'doc': 'The shape of this dataset. Use None for scalars.', 'default': None},
{'name': 'chunks', 'type': bool, 'doc': 'Whether or not to chunk this dataset.', 'default': False},
{'name': 'parent', 'type': GroupBuilder, 'doc': 'The parent builder of this builder.', 'default': None},
{'name': 'source', 'type': str, 'doc': 'The source of the data in this builder.', 'default': None})
def __init__(self, **kwargs):
""" Create a Builder object for a dataset """
-name, data, dtype, attributes, maxshape, chunks, parent, source = getargs(
-    'name', 'data', 'dtype', 'attributes', 'maxshape', 'chunks', 'parent', 'source', kwargs)
+name, data, dtype, attributes, dimension_labels, maxshape, chunks, parent, source = getargs(
+    'name', 'data', 'dtype', 'attributes', 'dimension_labels', 'maxshape', 'chunks', 'parent', 'source',
+    kwargs
+)
super().__init__(name, attributes, parent, source)
self['data'] = data
self['attributes'] = _copy.copy(attributes)
self.__dimension_labels = dimension_labels
self.__chunks = chunks
self.__maxshape = maxshape
if isinstance(data, BaseBuilder):
@@ -361,6 +368,11 @@ def data(self, val):
raise AttributeError("Cannot overwrite data.")
self['data'] = val

@property
def dimension_labels(self):
"""Labels for each dimension of this dataset from the spec."""
return self.__dimension_labels

@property
def chunks(self):
"""Whether or not this dataset is chunked."""
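
For context, a hedged sketch, not part of the commit, of constructing a `DatasetBuilder` with the new `dimension_labels` argument; in normal use the build process supplies these labels from the spec, and the names and data below are made up:

```python
# Hypothetical construction (not from this commit) of a DatasetBuilder with the
# new dimension_labels argument. During a real write, the build process derives
# these labels from the spec's dims for the matching data shape.
import numpy as np
from hdmf.build import DatasetBuilder

builder = DatasetBuilder(
    name='data',
    data=np.zeros((10, 3)),
    dtype='float64',
    dimension_labels=('time', 'channel'),  # one label per dimension of the data
)
print(builder.dimension_labels)  # ('time', 'channel')
```
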
17 changes: 15 additions & 2 deletions src/hdmf/build/manager.py
@@ -7,7 +7,7 @@
from .classgenerator import ClassGenerator, CustomClassGenerator, MCIClassGenerator
from ..container import AbstractContainer, Container, Data
from ..term_set import TypeConfigurator
from ..spec import DatasetSpec, GroupSpec, NamespaceCatalog
from ..spec import DatasetSpec, GroupSpec, NamespaceCatalog, RefSpec
from ..spec.spec import BaseStorageSpec
from ..utils import docval, getargs, ExtenderMeta, get_docval

@@ -480,6 +480,7 @@ def load_namespaces(self, **kwargs):
load_namespaces here has the advantage of being able to keep track of type dependencies across namespaces.
'''
deps = self.__ns_catalog.load_namespaces(**kwargs)
# register container types for each dependent type in each dependent namespace
for new_ns, ns_deps in deps.items():
for src_ns, types in ns_deps.items():
for dt in types:
@@ -529,7 +530,7 @@ def get_dt_container_cls(self, **kwargs):
namespace = ns_key
break
if namespace is None:
raise ValueError("Namespace could not be resolved.")
raise ValueError(f"Namespace could not be resolved for data type '{data_type}'.")

cls = self.__get_container_cls(namespace, data_type)

@@ -549,6 +550,8 @@

def __check_dependent_types(self, spec, namespace):
"""Ensure that classes for all types used by this type exist in this namespace and generate them if not.
`spec` should be a GroupSpec or DatasetSpec in the `namespace`
"""
def __check_dependent_types_helper(spec, namespace):
if isinstance(spec, (GroupSpec, DatasetSpec)):
@@ -564,6 +567,16 @@ def __check_dependent_types_helper(spec, namespace):

if spec.data_type_inc is not None:
self.get_dt_container_cls(spec.data_type_inc, namespace)

# handle attributes that have a reference dtype
for attr_spec in spec.attributes:
if isinstance(attr_spec.dtype, RefSpec):
self.get_dt_container_cls(attr_spec.dtype.target_type, namespace)
# handle datasets that have a reference dtype
if isinstance(spec, DatasetSpec):
if isinstance(spec.dtype, RefSpec):
self.get_dt_container_cls(spec.dtype.target_type, namespace)
# recurse into nested types
if isinstance(spec, GroupSpec):
for child_spec in (spec.groups + spec.datasets + spec.links):
__check_dependent_types_helper(child_spec, namespace)
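
For context, a rough sketch, not part of the commit, of the case the new checks cover: a type whose attribute dtype is a `RefSpec` targeting another type defined in the same namespace. The type names and the exact spec constructor arguments shown here are assumptions:

```python
# Assumed-spec sketch (not from this commit): a type whose attribute stores an
# object reference to another type defined in the same namespace.
from hdmf.spec import AttributeSpec, GroupSpec, RefSpec

target_spec = GroupSpec(doc='A type that gets referenced', data_type_def='Target')
referrer_spec = GroupSpec(
    doc='A type with a reference-typed attribute',
    data_type_def='Referrer',
    attributes=[
        AttributeSpec(
            name='target',
            doc='an object reference to a Target',
            dtype=RefSpec(target_type='Target', reftype='object'),
        ),
    ],
)
# When generating a class for 'Referrer', __check_dependent_types now also
# resolves (and, if needed, generates) the referenced 'Target' class.
```
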