diff --git a/docs/contributing.md b/docs/contributing.md index e617db2..4028dca 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -1,5 +1,16 @@ # Contributing +## Contributing code + +```bash +mamba env create -f ci/environment.yml +mamba activate virtualizarr-tests +pre-commit install +# git checkout -b new-feature +python -m pip install -e . --no-deps +python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose +``` + ## Contributing documentation ### Build the documentation locally diff --git a/docs/releases.rst b/docs/releases.rst index d308814..1594b8c 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -43,6 +43,8 @@ Internal Changes (:pull:`107`) By `Tom Nicholas `_. - Mark tests which require network access so that they are only run when `--run-network-tests` is passed a command-line argument to pytest. (:pull:`144`) By `Tom Nicholas `_. + - Determine file format from magic bytes rather than name suffix + (:pull:`143`) By `Scott Henderson `_. .. _v0.1: diff --git a/pyproject.toml b/pyproject.toml index 075059c..fd13d39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,3 +114,8 @@ line-ending = "auto" [tool.ruff.lint.isort] known-first-party = ["virtualizarr"] + +[tool.pytest.ini_options] +markers = [ + "network: marks test requiring internet (select with '--run-network-tests')", +] \ No newline at end of file diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index f668369..46b4f2b 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -33,7 +33,9 @@ def _generate_next_value_(name, start, count, last_values): class FileType(AutoName): netcdf3 = auto() - netcdf4 = auto() + netcdf4 = auto() # NOTE: netCDF4 is a subset of hdf5 + hdf4 = auto() + hdf5 = auto() grib = auto() tiff = auto() fits = auto() @@ -86,7 +88,7 @@ def read_kerchunk_references_from_file( refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() - elif filetype.name.lower() == "netcdf4": + elif filetype.name.lower() == "hdf5" or filetype.name.lower() == "netcdf4": from kerchunk.hdf import SingleHdf5ToZarr refs = SingleHdf5ToZarr( @@ -99,11 +101,11 @@ def read_kerchunk_references_from_file( elif filetype.name.lower() == "tiff": from kerchunk.tiff import tiff_to_zarr - refs = tiff_to_zarr(filepath, inline_threshold=0, **reader_options) + refs = tiff_to_zarr(filepath, **reader_options) elif filetype.name.lower() == "fits": from kerchunk.fits import process_file - refs = process_file(filepath, inline_threshold=0, **reader_options) + refs = process_file(filepath, **reader_options) else: raise NotImplementedError(f"Unsupported file type: {filetype.name}") @@ -116,34 +118,34 @@ def _automatically_determine_filetype( filepath: str, reader_options: Optional[dict[str, Any]] = None, ) -> FileType: - file_extension = Path(filepath).suffix + if Path(filepath).suffix == ".zarr": + # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... + raise NotImplementedError() + + # Read magic bytes from local or remote file fpath = _fsspec_openfile_from_filepath( filepath=filepath, reader_options=reader_options ) + magic_bytes = fpath.read(8) + fpath.close() - if file_extension == ".nc": - # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167 - magic = fpath.read() - - if magic[0:3] == b"CDF": - filetype = FileType.netcdf3 - elif magic[1:4] == b"HDF": - filetype = FileType.netcdf4 - else: - raise ValueError(".nc file does not appear to be NETCDF3 OR NETCDF4") - elif file_extension == ".zarr": - # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... - raise NotImplementedError() - elif file_extension == ".grib": + if magic_bytes.startswith(b"CDF"): + filetype = FileType.netcdf3 + elif magic_bytes.startswith(b"\x0e\x03\x13\x01"): + raise NotImplementedError("HDF4 formatted files not supported") + elif magic_bytes.startswith(b"\x89HDF"): + filetype = FileType.hdf5 + elif magic_bytes.startswith(b"GRIB"): filetype = FileType.grib - elif file_extension == ".tiff": + elif magic_bytes.startswith(b"II*"): filetype = FileType.tiff - elif file_extension == ".fits": + elif magic_bytes.startswith(b"SIMPLE"): filetype = FileType.fits else: - raise NotImplementedError(f"Unrecognised file extension: {file_extension}") + raise NotImplementedError( + f"Unrecognised file based on header bytes: {magic_bytes}" + ) - fpath.close() return filetype diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index a623bc0..22d6d7d 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -223,15 +223,43 @@ def test_automatically_determine_filetype_netcdf3_netcdf4(): assert FileType("netcdf3") == _automatically_determine_filetype( filepath=netcdf3_file_path ) - assert FileType("netcdf4") == _automatically_determine_filetype( + assert FileType("hdf5") == _automatically_determine_filetype( filepath=netcdf4_file_path ) +@pytest.mark.parametrize( + "filetype,headerbytes", + [ + ("netcdf3", b"CDF"), + ("hdf5", b"\x89HDF"), + ("grib", b"GRIB"), + ("tiff", b"II*"), + ("fits", b"SIMPLE"), + ], +) +def test_valid_filetype_bytes(tmp_path, filetype, headerbytes): + filepath = tmp_path / "file.abc" + with open(filepath, "wb") as f: + f.write(headerbytes) + assert FileType(filetype) == _automatically_determine_filetype(filepath=filepath) + + +def test_notimplemented_filetype(tmp_path): + for headerbytes in [b"JUNK", b"\x0e\x03\x13\x01"]: + filepath = tmp_path / "file.abc" + with open(filepath, "wb") as f: + f.write(headerbytes) + with pytest.raises(NotImplementedError): + _automatically_determine_filetype(filepath=filepath) + + def test_FileType(): # tests if FileType converts user supplied strings to correct filetype assert "netcdf3" == FileType("netcdf3").name assert "netcdf4" == FileType("netcdf4").name + assert "hdf4" == FileType("hdf4").name + assert "hdf5" == FileType("hdf5").name assert "grib" == FileType("grib").name assert "tiff" == FileType("tiff").name assert "fits" == FileType("fits").name diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index e55583b..5a0f1d3 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -308,6 +308,31 @@ def test_anon_read_s3(self, filetype, indexes): assert isinstance(vds[var].data, ManifestArray), var +@network +class TestReadFromURL: + def test_read_from_url(self): + examples = { + "grib": "https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib", + "netcdf3": "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc", + "netcdf4": "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc", + "hdf4": "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf", + # https://github.com/zarr-developers/VirtualiZarr/issues/159 + # "hdf5": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5", + # https://github.com/zarr-developers/VirtualiZarr/issues/160 + # "tiff": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif", + # "fits": "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits", + "jpg": "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg", + } + + for filetype, url in examples.items(): + if filetype in ["grib", "jpg", "hdf4"]: + with pytest.raises(NotImplementedError): + vds = open_virtual_dataset(url, reader_options={}) + else: + vds = open_virtual_dataset(url, reader_options={}) + assert isinstance(vds, xr.Dataset) + + class TestLoadVirtualDataset: def test_loadable_variables(self, netcdf4_file): vars_to_load = ["air", "time"] @@ -325,6 +350,13 @@ def test_loadable_variables(self, netcdf4_file): if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) + def test_explicit_filetype(self, netcdf4_file): + with pytest.raises(ValueError): + open_virtual_dataset(netcdf4_file, filetype="unknown") + + with pytest.raises(NotImplementedError): + open_virtual_dataset(netcdf4_file, filetype="grib") + @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") def test_open_virtual_dataset_passes_expected_args( self, mock_read_kerchunk, netcdf4_file diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index d8bf860..be2bf09 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -55,8 +55,8 @@ def open_virtual_dataset( File path to open as a set of virtualized zarr arrays. filetype : FileType, default None Type of file to be opened. Used to determine which kerchunk file format backend to use. - Can be one of {'netCDF3', 'netCDF4', 'zarr_v3'}. - If not provided will attempt to automatically infer the correct filetype from the the filepath's extension. + Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}. + If not provided will attempt to automatically infer the correct filetype from header bytes. drop_variables: list[str], default is None Variables in the file to drop before returning. loadable_variables: list[str], default is None