ome · joshmoore · Mar 2, 2022 · Mar 3, 2022 · Mar 3, 2022 · Mar 3, 2022
diff --git a/.github/workflows/zarr-dev.yml b/.github/workflows/zarr-dev.yml
@@ -38,7 +38,7 @@ jobs:
       - name: Install dependencies
         shell: bash -l {0}
         run: |
-          python -m pip install --upgrade pip wheel pytest tox
+          python -m pip install -r requirements/requirements-dev.txt
           python -m pip install \
           git+https://github.com/zarr-developers/zarr-python.git@master
 

diff --git a/.isort.cfg b/.isort.cfg
@@ -1,5 +1,5 @@
 [settings]
-known_third_party = dask,numcodecs,numpy,pytest,scipy,setuptools,skimage,zarr
+known_third_party = dask,entrypoints,numcodecs,numpy,ome_types,ome_zarr_metadata,pytest,scipy,setuptools,skimage,zarr
 multi_line_output = 3
 include_trailing_comma = True
 force_grid_wrap = 0

diff --git a/ome_zarr/bioformats2raw.py b/ome_zarr/bioformats2raw.py
@@ -0,0 +1,99 @@
+"""
+Spec definitions
+"""
+
+import logging
+import os
+import re
+import tempfile
+from xml.etree import ElementTree as ET
+
+import ome_types
+from ome_zarr_metadata import __version__  # noqa
+
+from ome_zarr.io import ZarrLocation
+from ome_zarr.reader import Node
+from ome_zarr.reader import Spec as Base
+
+__author__ = "Open Microscopy Environment (OME)"
+__copyright__ = "Open Microscopy Environment (OME)"
+__license__ = "BSD-2-Clause"
+
+_logger = logging.getLogger(__name__)
+
+
+class bioformats2raw(Base):
+    @staticmethod
+    def matches(zarr: ZarrLocation) -> bool:
+        layout = zarr.root_attrs.get("bioformats2raw.layout", None)
+        return layout == 3
+
+    def __init__(self, node: Node) -> None:
+        super().__init__(node)
+        try:
+            data = self.handle(node)
+            if data.plates:
+                _logger.info("Plates detected. Skipping implicit loading")
+            else:
+                for idx, image in enumerate(data.images):
+                    series = node.zarr.create(str(idx))
+                    assert series.exists(), f"{series} is missing"
+                    _logger.info("found %s", series)
+                    subnode = node.add(series)
+                    if subnode:
+                        subnode.metadata["ome-xml:index"] = idx
+                        subnode.metadata["ome-xml:image"] = image
+            node.metadata["ome-xml"] = data
+        except Exception:
+            _logger.exception("failed to parse metadata")
+
+    def fix_xml(self, ns: str, elem: ET.Element) -> None:
+        """
+        Patch one parsed OME-XML element in place.
+
+        Pixels lacking BinData/TiffData/MetadataOnly gets a MetadataOnly child
+        after its last Channel; Plane loses its (last) HashSHA1 child.
+        ``ns`` is the "{uri}" prefix of the tags, or "" without a namespace.
+
+        Note: elem.insert() was not updating the object correctly.
+        """
+        if elem.tag == f"{ns}Pixels":
+            must_have = {f"{ns}BinData", f"{ns}TiffData", f"{ns}MetadataOnly"}
+            children = {x.tag for x in elem}
+            if children.isdisjoint(must_have):
+                last_channel = -1
+                for idx, child in enumerate(elem):
+                    if child.tag == f"{ns}Channel":
+                        last_channel = idx
+                elem.insert(last_channel + 1, ET.Element(f"{ns}MetadataOnly"))
+        elif elem.tag == f"{ns}Plane":
+            remove = None
+            for child in elem:
+                if child.tag == f"{ns}HashSHA1":
+                    remove = child
+            # An Element without children is falsy, so test against None.
+            if remove is not None:
+                elem.remove(remove)
+
+    def parse_xml(self, filename: str) -> ome_types.model.OME:
+        """Load ``filename``, patch it via fix_xml and parse it with ome_types."""
+        # Parse the file and find the current schema ("{uri}" tag prefix)
+        root = ET.parse(filename)
+        m = re.match(r"\{.*\}", root.getroot().tag)
+        ns = m.group(0) if m else ""
+        # Update the XML to include MetadataOnly
+        for child in list(root.iter()):
+            self.fix_xml(ns, child)
+
+        # Write to a closed file: ome_types re-opens it by name, which fails
+        # on Windows while a NamedTemporaryFile is still open.
+        with tempfile.TemporaryDirectory() as tmp:
+            fixed = os.path.join(tmp, "METADATA.ome.xml")
+            root.write(fixed)
+            return ome_types.from_xml(fixed)
+
+    def handle(self, node: Node) -> ome_types.model.OME:
+        metadata = node.zarr.subpath("OME/METADATA.ome.xml")
+        _logger.info("Looking for metadata in %s", metadata)
+        if os.path.exists(metadata):
+            return self.parse_xml(metadata)
diff --git a/ome_zarr/reader.py b/ome_zarr/reader.py
@@ -1,12 +1,21 @@
-"""Reading logic for ome-zarr."""
+"""Reading logic for ome-zarr.
+
+The main class (Reader) is initialized with an [ome_zarr.io.ZarrLocation]
+as returned by [ome_zarr.io.parse_url] and walks up and down the Zarr
+hierarchy parsing each array or group into a [Node] which is aware of all
+meta(data) specifications ([Spec] class) which are available in the current
+runtime.
+"""
 
 import logging
 import math
 from abc import ABC
 from typing import Any, Dict, Iterator, List, Optional, Type, Union, cast, overload
 
 import dask.array as da
+import entrypoints
 import numpy as np
+import zarr
 from dask import delayed
 
 from .axes import Axes
@@ -45,21 +54,49 @@ def __init__(
         self.post_nodes: List[Node] = []
 
         # TODO: this should be some form of plugin infra over subclasses
+        found: List[Spec] = []
         if Labels.matches(zarr):
-            self.specs.append(Labels(self))
+            found.append(Labels(self))
+            self.specs.append(found[-1])
         if Label.matches(zarr):
-            self.specs.append(Label(self))
+            found.append(Label(self))
+            self.specs.append(found[-1])
         if Multiscales.matches(zarr):
-            self.specs.append(Multiscales(self))
+            found.append(Multiscales(self))
+            self.specs.append(found[-1])
         if OMERO.matches(zarr):
-            self.specs.append(OMERO(self))
+            found.append(OMERO(self))
+            self.specs.append(found[-1])
         if plate_labels:
-            self.specs.append(PlateLabels(self))
+            found.append(PlateLabels(self))
+            self.specs.append(found[-1])
         elif Plate.matches(zarr):
-            self.specs.append(Plate(self))
+            found.append(Plate(self))
+            self.specs.append(found[-1])
             # self.add(zarr, plate_labels=True)
         if Well.matches(zarr):
-            self.specs.append(Well(self))
+            found.append(Well(self))
+            self.specs.append(found[-1])
+
+        # Load all entrypoints and give them a chance
+        # to claim and parse the current node.
+        for key, value in entrypoints.get_group_named("ome_zarr.spec").items():
+            cls = value.load()
+            if cls.matches(zarr):
+                found.append(cls(self))
+                self.specs.append(found[-1])
+
+        # Anything that has not received a type at this point
+        # can be considered an implicit group.
+        if not found:
+            self.specs.append(Implicit(self))
+
+        if False:  # Temporarily disable. See #174
+            # Load up the hierarchy
+            if Leaf.matches(zarr):
+                self.specs.append(Leaf(self))
+            else:
+                self.specs.append(Root(self))
 
     @overload
     def first(self, spectype: Type["Well"]) -> Optional["Well"]:
@@ -178,6 +215,60 @@ def lookup(self, key: str, default: Any) -> Any:
         return self.zarr.root_attrs.get(key, default)
 
 
+class Implicit(Spec):
+    """
+    A spec-type which simply iterates over available zgroups.
+    """
+
+    @staticmethod
+    def matches(zarr: ZarrLocation) -> bool:
+        """Always return true"""
+        return True
+
+    def __init__(self, node: Node) -> None:
+        super().__init__(node)
+
+        for name in zarr.group(self.zarr.store).group_keys():
+            child_zarr = self.zarr.create(name)
+            if child_zarr.exists():
+                node.add(child_zarr)
+
+
+class Leaf(Spec):
+    """
+    A non-root level of the Zarr hierarchy
+    """
+
+    @staticmethod
+    def matches(zarr: ZarrLocation) -> bool:
+        """Return if the parent directory is within the zarr fileset"""
+
+        parent_zarr = zarr.create("..")
+        return bool(parent_zarr.exists() and (parent_zarr.zgroup or parent_zarr.zarray))
+
+    def __init__(self, node: Node) -> None:
+        super().__init__(node)
+        parent_zarr = node.zarr.create("..")
+        if parent_zarr.exists() and (parent_zarr.zgroup or parent_zarr.zarray):
+            node.add(parent_zarr)
+
+
+class Root(Spec):
+    """
+    Root of the Zarr fileset
+    """
+
+    @staticmethod
+    def matches(zarr: ZarrLocation) -> bool:
+        """Return if the parent directory is not within the zarr fileset"""
+
+        parent_zarr = zarr.create("..")
+        return parent_zarr.exists() and not (parent_zarr.zgroup or parent_zarr.zarray)
+
+    def __init__(self, node: Node) -> None:
+        super().__init__(node)
+
+
 class Labels(Spec):
     """Relatively small specification for the well-known "labels" group which only
     contains the name of subgroups which should be loaded as labeled images."""

diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
@@ -1,6 +1,7 @@
 black
 cython >= 0.29.16
 numpy >= 1.16.0
+entrypoints
 pre-commit
 tox
 wheel

diff --git a/setup.py b/setup.py
@@ -24,6 +24,8 @@ def read(fname):
 install_requires += (["requests"],)
 install_requires += (["scikit-image"],)
 install_requires += (["toolz"],)
+install_requires += (["entrypoints"],)
+install_requires += (["ome-types"],)
 
 
 setup(
@@ -49,6 +51,7 @@ def read(fname):
     ],
     entry_points={
         "console_scripts": ["ome_zarr = ome_zarr.cli:main"],
+        "ome_zarr.spec": ["bioformats2raw = ome_zarr.bioformats2raw:bioformats2raw"],
     },
     tests_require=["pytest"],
 )