Skip to content

Commit

Permalink
Fix opening tiff and fits files (#162)
Browse files Browse the repository at this point in the history
* make reader_options default None instead of a mutable dict

* wrap the refs up in the top-level dict for consistency

* deal with another inconsistency in kerchunk's representation of compressors

* remove comment

* same for fits

* ensure reader_options is not None

* add tiff file to list of files to test opening

* don't create indexes when opening files

* parametrize the test over different filetypes

* dependency for opening tiff files

* only test opening tiff files if tifffile package is available

* opens FITS files too

* lint

* release notes

* minor docstring improvements

* remove question mark for FITS
  • Loading branch information
TomNicholas committed Jun 28, 2024
1 parent 10e7863 commit 76fbb9c
Show file tree
Hide file tree
Showing 9 changed files with 92 additions and 39 deletions.
4 changes: 4 additions & 0 deletions ci/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ dependencies:
- fsspec
- s3fs
- fastparquet
# for opening tiff files
- tifffile
# for opening FITS files
- astropy
2 changes: 1 addition & 1 deletion docs/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Users of kerchunk may find the following comparison table useful, which shows wh
| From a COG / tiff file | `kerchunk.tiff.tiff_to_zarr` | `open_virtual_dataset`, via `kerchunk.tiff.tiff_to_zarr` or potentially `cog3pio` |
| From a Zarr v2 store | `kerchunk.zarr.ZarrToZarr` | `open_virtual_dataset`, via `kerchunk.zarr.ZarrToZarr` ? |
| From a GRIB2 file | `kerchunk.grib2.scan_grib` | `open_virtual_datatree`, via `kerchunk.grib2.scan_grib` ? |
| From a FITS file | `kerchunk.fits.process_file` | `open_virtual_dataset`, via `kerchunk.fits.process_file` ? |
| From a FITS file | `kerchunk.fits.process_file` | `open_virtual_dataset`, via `kerchunk.fits.process_file` |
| **In-memory representation (2)** | | |
| In-memory representation of byte ranges for single array | Part of a "reference `dict`" with keys for each chunk in array | `ManifestArray` instance (wrapping a `ChunkManifest` instance) |
| In-memory representation of actual data values | Encoded bytes directly serialized into the "reference `dict`", created on a per-chunk basis using the `inline_threshold` kwarg | `numpy.ndarray` instances, created on a per-variable basis using the `loadable_variables` kwarg |
Expand Down
4 changes: 3 additions & 1 deletion docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ v0.2 (unreleased)
New Features
~~~~~~~~~~~~

- Now successfully opens both tiff and FITS files. (:issue:`160`, :pull:`162`)
By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Added a `.rename_paths` convenience method to rename paths in a manifest according to a function.
(:pull:`152`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- New ``cftime_variables`` option on ``open_virtual_dataset`` enables encoding/decoding time.
Expand Down Expand Up @@ -45,7 +47,7 @@ Internal Changes
(:pull:`107`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Mark tests which require network access so that they are only run when `--run-network-tests` is passed a command-line argument to pytest.
(:pull:`144`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Determine file format from magic bytes rather than name suffix
- Determine file format from magic bytes rather than name suffix
(:pull:`143`) By `Scott Henderson <https://github.com/scottyhq>`_.

.. _v0.1:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -118,4 +118,4 @@ known-first-party = ["virtualizarr"]
[tool.pytest.ini_options]
markers = [
"network: marks test requiring internet (select with '--run-network-tests')",
]
]
13 changes: 11 additions & 2 deletions virtualizarr/kerchunk.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import base64
import json
import warnings
from enum import Enum, auto
from pathlib import Path
from typing import Any, NewType, Optional, cast
Expand Down Expand Up @@ -104,11 +105,19 @@ def read_kerchunk_references_from_file(
elif filetype.name.lower() == "tiff":
from kerchunk.tiff import tiff_to_zarr

refs = tiff_to_zarr(filepath, **reader_options)
reader_options.pop("storage_options", {})
warnings.warn(
"storage_options have been dropped from reader_options as they are not supported by kerchunk.tiff.tiff_to_zarr",
UserWarning,
)

# handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160
refs = {"refs": tiff_to_zarr(filepath, **reader_options)}
elif filetype.name.lower() == "fits":
from kerchunk.fits import process_file

refs = process_file(filepath, **reader_options)
# handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160
refs = {"refs": process_file(filepath, **reader_options)}
else:
raise NotImplementedError(f"Unsupported file type: {filetype.name}")

Expand Down
2 changes: 2 additions & 0 deletions virtualizarr/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def _importorskip(
return has, func


# Optional-dependency probes (alphabetical): each `has_<pkg>` is a bool flag,
# and each `requires_<pkg>` is the second value returned by _importorskip —
# presumably a pytest marker/decorator that skips a test when the package
# cannot be imported (NOTE(review): confirm against _importorskip above).
has_astropy, requires_astropy = _importorskip("astropy")
has_s3fs, requires_s3fs = _importorskip("s3fs")
has_tifffile, requires_tifffile = _importorskip("tifffile")


def create_manifestarray(
Expand Down
69 changes: 48 additions & 21 deletions virtualizarr/tests/test_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from virtualizarr import open_virtual_dataset
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.tests import network, requires_s3fs
from virtualizarr.tests import has_astropy, has_tifffile, network, requires_s3fs
from virtualizarr.zarr import ZArray


Expand Down Expand Up @@ -310,27 +310,54 @@ def test_anon_read_s3(self, filetype, indexes):

@network
class TestReadFromURL:
def test_read_from_url(self):
examples = {
"grib": "https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib",
"netcdf3": "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc",
"netcdf4": "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc",
"hdf4": "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf",
@pytest.mark.parametrize(
"filetype, url",
[
(
"grib",
"https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib",
),
(
"netcdf3",
"https://github.com/pydata/xarray-data/raw/master/air_temperature.nc",
),
(
"netcdf4",
"https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc",
),
(
"hdf4",
"https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf",
),
# https://github.com/zarr-developers/VirtualiZarr/issues/159
# "hdf5": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5",
# https://github.com/zarr-developers/VirtualiZarr/issues/160
# "tiff": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif",
# "fits": "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits",
"jpg": "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg",
}

for filetype, url in examples.items():
if filetype in ["grib", "jpg", "hdf4"]:
with pytest.raises(NotImplementedError):
vds = open_virtual_dataset(url, reader_options={})
else:
vds = open_virtual_dataset(url, reader_options={})
assert isinstance(vds, xr.Dataset)
# ("hdf5", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5"),
pytest.param(
"tiff",
"https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif",
marks=pytest.mark.skipif(
not has_tifffile, reason="package tifffile is not available"
),
),
pytest.param(
"fits",
"https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits",
marks=pytest.mark.skipif(
not has_astropy, reason="package astropy is not available"
),
),
(
"jpg",
"https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg",
),
],
)
def test_read_from_url(self, filetype, url):
if filetype in ["grib", "jpg", "hdf4"]:
with pytest.raises(NotImplementedError):
vds = open_virtual_dataset(url, reader_options={}, indexes={})
else:
vds = open_virtual_dataset(url, reader_options={}, indexes={})
assert isinstance(vds, xr.Dataset)


class TestLoadVirtualDataset:
Expand Down
27 changes: 15 additions & 12 deletions virtualizarr/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,14 @@ def open_virtual_dataset(
cftime_variables: Iterable[str] | None = None,
indexes: Mapping[str, Index] | None = None,
virtual_array_class=ManifestArray,
reader_options: Optional[dict] = {
"storage_options": {"key": "", "secret": "", "anon": True}
},
reader_options: Optional[dict] = None,
) -> xr.Dataset:
"""
Open a file or store as an xarray Dataset wrapping virtualized zarr arrays.
No data variables will be loaded.
No data variables will be loaded unless specified in the ``loadable_variables`` kwarg (in which case they will be xarray lazily indexed arrays).
Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass indexes={}.
Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass ``indexes={}``.
Parameters
----------
Expand All @@ -65,20 +63,20 @@ def open_virtual_dataset(
loadable_variables: list[str], default is None
Variables in the file to open as lazy numpy/dask arrays instead of instances of virtual_array_class.
Default is to open all variables as virtual arrays (i.e. ManifestArray).
cftime_variables : list[str], default is None
Interpret the value of specified vars using cftime, returning a datetime.
These will be automatically re-encoded with cftime. This list must be a subset
of ``loadable_variables``.
indexes : Mapping[str, Index], default is None
Indexes to use on the returned xarray Dataset.
Default is None, which will read any 1D coordinate data to create in-memory Pandas indexes.
To avoid creating any indexes, pass indexes={}.
virtual_array_class
Virtual array class to use to represent the references to the chunks in each on-disk array.
Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that.
cftime_variables : list[str], default is None
Interpret the value of specified vars using cftime, returning a datetime.
These will be automatically re-encoded with cftime. This list must be a subset
of ``loadable_variables``.
reader_options: dict, default {'storage_options':{'key':'', 'secret':'', 'anon':True}}
Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments,
so ensure reader_options match selected Kerchunk reader arguments.
reader_options: dict, default {'storage_options': {'key': '', 'secret': '', 'anon': True}}
Dict passed into Kerchunk file readers, to allow reading from remote filesystems.
Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments.
Returns
-------
Expand Down Expand Up @@ -125,6 +123,11 @@ def open_virtual_dataset(
storepath=filepath, drop_variables=drop_variables, indexes=indexes
)
else:
if reader_options is None:
reader_options = {
"storage_options": {"key": "", "secret": "", "anon": True}
}

# this is the only place we actually always need to use kerchunk directly
# TODO avoid even reading byte ranges for variables that will be dropped later anyway?
vds_refs = kerchunk.read_kerchunk_references_from_file(
Expand Down
8 changes: 7 additions & 1 deletion virtualizarr/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,15 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
if fill_value is None or fill_value == "NaN" or fill_value == "nan":
fill_value = np.nan

compressor = decoded_arr_refs_zarray["compressor"]
# deal with an inconsistency in kerchunk's tiff_to_zarr function
# TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream.
if compressor is not None and "id" in compressor:
compressor = compressor["id"]

return ZArray(
chunks=tuple(decoded_arr_refs_zarray["chunks"]),
compressor=decoded_arr_refs_zarray["compressor"],
compressor=compressor,
dtype=np.dtype(decoded_arr_refs_zarray["dtype"]),
fill_value=fill_value,
filters=decoded_arr_refs_zarray["filters"],
Expand Down

0 comments on commit 76fbb9c

Please sign in to comment.