Skip to content

Commit

Permalink
Fix opening tiff and fits files (#162)
Browse files Browse the repository at this point in the history
* make reader_options default None instead of a mutable dict

* wrap the refs up in the top-level dict for consistency

* deal with another inconsistency in kerchunk's representation of compressors

* remove comment

* same for fits

* ensure reader_options is not None

* add tiff file to list of files to test opening

* don't create indexes when opening files

* parametrize the test over different filetypes

* dependency for opening tiff files

* only test opening tiff files if tifffile package is available

* opens FITS files too

* lint

* release notes

* minor docstring improvements

* remove question mark for FITS
  • Loading branch information
TomNicholas committed Jun 28, 2024
1 parent 10e7863 commit 76fbb9c
Show file tree
Hide file tree
Showing 9 changed files with 92 additions and 39 deletions.
4 changes: 4 additions & 0 deletions ci/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ dependencies:
- fsspec
- s3fs
- fastparquet
# for opening tiff files
- tifffile
# for opening FITS files
- astropy
2 changes: 1 addition & 1 deletion docs/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Users of kerchunk may find the following comparison table useful, which shows wh
| From a COG / tiff file | `kerchunk.tiff.tiff_to_zarr` | `open_virtual_dataset`, via `kerchunk.tiff.tiff_to_zarr` or potentially `cog3pio` |
| From a Zarr v2 store | `kerchunk.zarr.ZarrToZarr` | `open_virtual_dataset`, via `kerchunk.zarr.ZarrToZarr` ? |
| From a GRIB2 file | `kerchunk.grib2.scan_grib` | `open_virtual_datatree`, via `kerchunk.grib2.scan_grib` ? |
| From a FITS file | `kerchunk.fits.process_file` | `open_virtual_dataset`, via `kerchunk.fits.process_file` ? |
| From a FITS file | `kerchunk.fits.process_file` | `open_virtual_dataset`, via `kerchunk.fits.process_file` |
| **In-memory representation (2)** | | |
| In-memory representation of byte ranges for single array | Part of a "reference `dict`" with keys for each chunk in array | `ManifestArray` instance (wrapping a `ChunkManifest` instance) |
| In-memory representation of actual data values | Encoded bytes directly serialized into the "reference `dict`", created on a per-chunk basis using the `inline_threshold` kwarg | `numpy.ndarray` instances, created on a per-variable basis using the `loadable_variables` kwarg |
Expand Down
4 changes: 3 additions & 1 deletion docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ v0.2 (unreleased)
New Features
~~~~~~~~~~~~

- Now successfully opens both tiff and FITS files. (:issue:`160`, :pull:`162`)
By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Added a `.rename_paths` convenience method to rename paths in a manifest according to a function.
(:pull:`152`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- New ``cftime_variables`` option on ``open_virtual_dataset`` enables encoding/decoding time.
Expand Down Expand Up @@ -45,7 +47,7 @@ Internal Changes
(:pull:`107`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Mark tests which require network access so that they are only run when `--run-network-tests` is passed a command-line argument to pytest.
(:pull:`144`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Determine file format from magic bytes rather than name suffix
- Determine file format from magic bytes rather than name suffix
(:pull:`143`) By `Scott Henderson <https://github.com/scottyhq>`_.

.. _v0.1:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -118,4 +118,4 @@ known-first-party = ["virtualizarr"]
[tool.pytest.ini_options]
markers = [
"network: marks test requiring internet (select with '--run-network-tests')",
]
]
13 changes: 11 additions & 2 deletions virtualizarr/kerchunk.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import base64
import json
import warnings
from enum import Enum, auto
from pathlib import Path
from typing import Any, NewType, Optional, cast
Expand Down Expand Up @@ -104,11 +105,19 @@ def read_kerchunk_references_from_file(
elif filetype.name.lower() == "tiff":
from kerchunk.tiff import tiff_to_zarr

refs = tiff_to_zarr(filepath, **reader_options)
reader_options.pop("storage_options", {})
warnings.warn(
"storage_options have been dropped from reader_options as they are not supported by kerchunk.tiff.tiff_to_zarr",
UserWarning,
)

# handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160
refs = {"refs": tiff_to_zarr(filepath, **reader_options)}
elif filetype.name.lower() == "fits":
from kerchunk.fits import process_file

refs = process_file(filepath, **reader_options)
# handle inconsistency in kerchunk, see GH issue https://github.com/zarr-developers/VirtualiZarr/issues/160
refs = {"refs": process_file(filepath, **reader_options)}
else:
raise NotImplementedError(f"Unsupported file type: {filetype.name}")

Expand Down
2 changes: 2 additions & 0 deletions virtualizarr/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def _importorskip(
return has, func


# Optional-dependency probes (alphabetical): each `has_<pkg>` is a bool flag,
# and each `requires_<pkg>` is the second value returned by _importorskip —
# presumably a pytest marker/decorator that skips a test when the package
# cannot be imported (NOTE(review): confirm against _importorskip above).
has_astropy, requires_astropy = _importorskip("astropy")
has_s3fs, requires_s3fs = _importorskip("s3fs")
has_tifffile, requires_tifffile = _importorskip("tifffile")


def create_manifestarray(
Expand Down
69 changes: 48 additions & 21 deletions virtualizarr/tests/test_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from virtualizarr import open_virtual_dataset
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.tests import network, requires_s3fs
from virtualizarr.tests import has_astropy, has_tifffile, network, requires_s3fs
from virtualizarr.zarr import ZArray


Expand Down Expand Up @@ -310,27 +310,54 @@ def test_anon_read_s3(self, filetype, indexes):

@network
class TestReadFromURL:
def test_read_from_url(self):
examples = {
"grib": "https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib",
"netcdf3": "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc",
"netcdf4": "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc",
"hdf4": "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf",
@pytest.mark.parametrize(
"filetype, url",
[
(
"grib",
"https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib",
),
(
"netcdf3",
"https://github.com/pydata/xarray-data/raw/master/air_temperature.nc",
),
(
"netcdf4",
"https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc",
),
(
"hdf4",
"https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf",
),
# https://github.com/zarr-developers/VirtualiZarr/issues/159
# "hdf5": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5",
# https://github.com/zarr-developers/VirtualiZarr/issues/160
# "tiff": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif",
# "fits": "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits",
"jpg": "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg",
}

for filetype, url in examples.items():
if filetype in ["grib", "jpg", "hdf4"]:
with pytest.raises(NotImplementedError):
vds = open_virtual_dataset(url, reader_options={})
else:
vds = open_virtual_dataset(url, reader_options={})
assert isinstance(vds, xr.Dataset)
# ("hdf5", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5"),
pytest.param(
"tiff",
"https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif",
marks=pytest.mark.skipif(
not has_tifffile, reason="package tifffile is not available"
),
),
pytest.param(
"fits",
"https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits",
marks=pytest.mark.skipif(
not has_astropy, reason="package astropy is not available"
),
),
(
"jpg",
"https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg",
),
],
)
def test_read_from_url(self, filetype, url):
if filetype in ["grib", "jpg", "hdf4"]:
with pytest.raises(NotImplementedError):
vds = open_virtual_dataset(url, reader_options={}, indexes={})
else:
vds = open_virtual_dataset(url, reader_options={}, indexes={})
assert isinstance(vds, xr.Dataset)


class TestLoadVirtualDataset:
Expand Down
27 changes: 15 additions & 12 deletions virtualizarr/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,14 @@ def open_virtual_dataset(
cftime_variables: Iterable[str] | None = None,
indexes: Mapping[str, Index] | None = None,
virtual_array_class=ManifestArray,
reader_options: Optional[dict] = {
"storage_options": {"key": "", "secret": "", "anon": True}
},
reader_options: Optional[dict] = None,
) -> xr.Dataset:
"""
Open a file or store as an xarray Dataset wrapping virtualized zarr arrays.
No data variables will be loaded.
No data variables will be loaded unless specified in the ``loadable_variables`` kwarg (in which case they will be xarray lazily indexed arrays).
Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass indexes={}.
Xarray indexes can optionally be created (the default behaviour). To avoid creating any xarray indexes pass ``indexes={}``.
Parameters
----------
Expand All @@ -65,20 +63,20 @@ def open_virtual_dataset(
loadable_variables: list[str], default is None
Variables in the file to open as lazy numpy/dask arrays instead of instances of virtual_array_class.
Default is to open all variables as virtual arrays (i.e. ManifestArray).
cftime_variables : list[str], default is None
Interpret the value of specified vars using cftime, returning a datetime.
These will be automatically re-encoded with cftime. This list must be a subset
of ``loadable_variables``.
indexes : Mapping[str, Index], default is None
Indexes to use on the returned xarray Dataset.
Default is None, which will read any 1D coordinate data to create in-memory Pandas indexes.
To avoid creating any indexes, pass indexes={}.
virtual_array_class
Virtual array class to use to represent the references to the chunks in each on-disk array.
Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that.
cftime_variables : list[str], default is None
Interpret the value of specified vars using cftime, returning a datetime.
These will be automatically re-encoded with cftime. This list must be a subset
of ``loadable_variables``.
reader_options: dict, default {'storage_options':{'key':'', 'secret':'', 'anon':True}}
Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments,
so ensure reader_options match selected Kerchunk reader arguments.
reader_options: dict, default {'storage_options': {'key': '', 'secret': '', 'anon': True}}
Dict passed into Kerchunk file readers, to allow reading from remote filesystems.
Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments.
Returns
-------
Expand Down Expand Up @@ -125,6 +123,11 @@ def open_virtual_dataset(
storepath=filepath, drop_variables=drop_variables, indexes=indexes
)
else:
if reader_options is None:
reader_options = {
"storage_options": {"key": "", "secret": "", "anon": True}
}

# this is the only place we actually always need to use kerchunk directly
# TODO avoid even reading byte ranges for variables that will be dropped later anyway?
vds_refs = kerchunk.read_kerchunk_references_from_file(
Expand Down
8 changes: 7 additions & 1 deletion virtualizarr/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,15 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
if fill_value is None or fill_value == "NaN" or fill_value == "nan":
fill_value = np.nan

compressor = decoded_arr_refs_zarray["compressor"]
# deal with an inconsistency in kerchunk's tiff_to_zarr function
# TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream.
if compressor is not None and "id" in compressor:
compressor = compressor["id"]

return ZArray(
chunks=tuple(decoded_arr_refs_zarray["chunks"]),
compressor=decoded_arr_refs_zarray["compressor"],
compressor=compressor,
dtype=np.dtype(decoded_arr_refs_zarray["dtype"]),
fill_value=fill_value,
filters=decoded_arr_refs_zarray["filters"],
Expand Down

0 comments on commit 76fbb9c

Please sign in to comment.