From 7f55a19ddb418041a4ffe32bd7ef56305726d421 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Fri, 14 Jun 2024 13:36:31 -0700 Subject: [PATCH 1/8] add more common file suffixes --- virtualizarr/kerchunk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 9424ce40..ab5e334c 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -121,7 +121,7 @@ def _automatically_determine_filetype( filepath=filepath, reader_options=reader_options ) - if file_extension == ".nc": + if file_extension in [".nc",".nc4",".hdf",".h5"]: # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167 magic = fpath.read() @@ -136,7 +136,7 @@ def _automatically_determine_filetype( raise NotImplementedError() elif file_extension == ".grib": filetype = FileType.grib - elif file_extension == ".tiff": + elif file_extension in [".tif",".tiff"]: filetype = FileType.tiff elif file_extension == ".fits": filetype = FileType.fits From d8445b02d44986450667f6a8993b6bce0e55f2b8 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Fri, 14 Jun 2024 15:54:12 -0700 Subject: [PATCH 2/8] determine format based on magic bytes --- virtualizarr/kerchunk.py | 41 ++++++++++++++--------------- virtualizarr/tests/test_kerchunk.py | 4 +-- virtualizarr/tests/test_xarray.py | 2 +- 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index ab5e334c..4d7e2d69 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -33,7 +33,7 @@ def _generate_next_value_(name, start, count, last_values): class FileType(AutoName): netcdf3 = auto() - netcdf4 = auto() + hdf5 = auto() grib = auto() tiff = auto() fits = auto() @@ -86,7 +86,7 @@ def read_kerchunk_references_from_file( refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() - elif filetype.name.lower() == "netcdf4": + elif filetype.name.lower() == "hdf5": from kerchunk.hdf import SingleHdf5ToZarr refs = SingleHdf5ToZarr( @@ -116,34 +116,33 @@ def _automatically_determine_filetype( filepath: str, reader_options: Optional[dict[str, Any]] = None, ) -> FileType: - file_extension = Path(filepath).suffix + + if Path(filepath).suffix == ".zarr": + # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... + raise NotImplementedError() + + # Read magic bytes from local or remote file fpath = _fsspec_openfile_from_filepath( filepath=filepath, reader_options=reader_options ) + magic_bytes = fpath.read(8) + fpath.close() - if file_extension in [".nc",".nc4",".hdf",".h5"]: - # based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167 - magic = fpath.read() - - if magic[0:3] == b"CDF": - filetype = FileType.netcdf3 - elif magic[1:4] == b"HDF": - filetype = FileType.netcdf4 - else: - raise ValueError(".nc file does not appear to be NETCDF3 OR NETCDF4") - elif file_extension == ".zarr": - # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... - raise NotImplementedError() - elif file_extension == ".grib": + if magic_bytes.startswith(b"CDF"): + filetype = FileType.netcdf3 + elif magic_bytes.startswith(b"\x0e\x03\x13\x01"): + raise NotImplementedError(f"HDF4 formatted files not supported") + elif magic_bytes.startswith(b"\x89HDF"): + filetype = FileType.hdf5 + elif magic_bytes.startswith(b'GRIB'): filetype = FileType.grib - elif file_extension in [".tif",".tiff"]: + elif magic_bytes.startswith(b'II*'): filetype = FileType.tiff - elif file_extension == ".fits": + elif magic_bytes.startswith(b'SIMPLE'): filetype = FileType.fits else: - raise NotImplementedError(f"Unrecognised file extension: {file_extension}") + raise NotImplementedError(f"Unrecognised file based on header bytes: {magic_bytes}") - fpath.close() return filetype diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index c255a8d8..9be1217d 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -217,7 +217,7 @@ def test_automatically_determine_filetype_netcdf3_netcdf4(): assert FileType("netcdf3") == _automatically_determine_filetype( filepath=netcdf3_file_path ) - assert FileType("netcdf4") == _automatically_determine_filetype( + assert FileType("hdf5") == _automatically_determine_filetype( filepath=netcdf4_file_path ) @@ -225,7 +225,7 @@ def test_automatically_determine_filetype_netcdf3_netcdf4(): def test_FileType(): # tests if FileType converts user supplied strings to correct filetype assert "netcdf3" == FileType("netcdf3").name - assert "netcdf4" == FileType("netcdf4").name + assert "hdf5" == FileType("hdf5").name assert "grib" == FileType("grib").name assert "tiff" == FileType("tiff").name assert "fits" == FileType("fits").name diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 695759bd..40bd3031 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -274,7 +274,7 @@ def test_combine_by_coords(self, netcdf4_files): @pytest.mark.parametrize( - "filetype", ["netcdf4", None], ids=["netcdf4 filetype", "None filetype"] + "filetype", ["hdf5", None], ids=["netcdf4 filetype", "None filetype"] ) @pytest.mark.parametrize("indexes", [None, {}], ids=["None index", "empty dict index"]) def test_anon_read_s3(filetype, indexes): From 056a583e9556479645ce2600e79036f7b30c74f0 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Tue, 25 Jun 2024 09:35:00 -0700 Subject: [PATCH 3/8] add release note --- docs/releases.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/releases.rst b/docs/releases.rst index d3088147..1594b8c9 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -43,6 +43,8 @@ Internal Changes (:pull:`107`) By `Tom Nicholas `_. - Mark tests which require network access so that they are only run when `--run-network-tests` is passed a command-line argument to pytest. (:pull:`144`) By `Tom Nicholas `_. + - Determine file format from magic bytes rather than name suffix + (:pull:`143`) By `Scott Henderson `_. .. _v0.1: From 68f640cbcb1518c7a4c0d4c2f09ed4c31823cc5c Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Tue, 25 Jun 2024 09:49:03 -0700 Subject: [PATCH 4/8] update docstring, keep netCDF4 alias --- virtualizarr/kerchunk.py | 3 ++- virtualizarr/tests/test_kerchunk.py | 3 ++- virtualizarr/xarray.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 58dd252c..b6687584 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -33,6 +33,7 @@ def _generate_next_value_(name, start, count, last_values): class FileType(AutoName): netcdf3 = auto() + netcdf4 = auto() # NOTE: netCDF4 is a subset of hdf5 hdf5 = auto() grib = auto() tiff = auto() @@ -86,7 +87,7 @@ def read_kerchunk_references_from_file( refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate() - elif filetype.name.lower() == "hdf5": + elif filetype.name.lower() == "hdf5" or filetype.name.lower() == "netcdf4": from kerchunk.hdf import SingleHdf5ToZarr refs = SingleHdf5ToZarr( diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 3e4bc931..cbe9bd54 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -223,7 +223,7 @@ def test_automatically_determine_filetype_netcdf3_netcdf4(): assert FileType("netcdf3") == _automatically_determine_filetype( filepath=netcdf3_file_path ) - assert FileType("hdf5") == _automatically_determine_filetype( + assert FileType("netcdf4") == _automatically_determine_filetype( filepath=netcdf4_file_path ) @@ -231,6 +231,7 @@ def test_automatically_determine_filetype_netcdf3_netcdf4(): def test_FileType(): # tests if FileType converts user supplied strings to correct filetype assert "netcdf3" == FileType("netcdf3").name + assert "netcdf4" == FileType("netcdf4").name assert "hdf5" == FileType("hdf5").name assert "grib" == FileType("grib").name assert "tiff" == FileType("tiff").name diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index d8bf8609..be2bf09c 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -55,8 +55,8 @@ def open_virtual_dataset( File path to open as a set of virtualized zarr arrays. filetype : FileType, default None Type of file to be opened. Used to determine which kerchunk file format backend to use. - Can be one of {'netCDF3', 'netCDF4', 'zarr_v3'}. - If not provided will attempt to automatically infer the correct filetype from the the filepath's extension. + Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}. + If not provided will attempt to automatically infer the correct filetype from header bytes. drop_variables: list[str], default is None Variables in the file to drop before returning. loadable_variables: list[str], default is None From a9961b669cb9459902576986afb22c1cd87b1026 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Tue, 25 Jun 2024 13:11:41 -0700 Subject: [PATCH 5/8] small contributing section, update env --- ci/environment.yml | 2 ++ docs/contributing.md | 10 ++++++++++ pyproject.toml | 5 +++++ 3 files changed, 17 insertions(+) diff --git a/ci/environment.yml b/ci/environment.yml index 0385ea5a..1b122e3f 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -3,6 +3,7 @@ channels: - conda-forge - nodefaults dependencies: + - astropy # for FITS - h5netcdf - h5py - hdf5 @@ -14,6 +15,7 @@ dependencies: - ujson - packaging - universal_pathlib + - tiffile # for TIFF # Testing - codecov - pre-commit diff --git a/docs/contributing.md b/docs/contributing.md index e617db25..dc6a73ec 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -1,5 +1,15 @@ # Contributing +## Contributing code + +```bash +mamba env create -f ci/environment.yml +mamba activate virtualizarr-tests +# git checkout -b new-feature +python -m pip install -e . --no-deps +python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose +``` + ## Contributing documentation ### Build the documentation locally diff --git a/pyproject.toml b/pyproject.toml index 075059cc..fd13d390 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,3 +114,8 @@ line-ending = "auto" [tool.ruff.lint.isort] known-first-party = ["virtualizarr"] + +[tool.pytest.ini_options] +markers = [ + "network: marks test requiring internet (select with '--run-network-tests')", +] \ No newline at end of file From ca1851e0bdc536d304ecbf1cb8e3df0a42ae0a8f Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Tue, 25 Jun 2024 13:11:50 -0700 Subject: [PATCH 6/8] add tests --- virtualizarr/kerchunk.py | 5 ++-- virtualizarr/tests/test_kerchunk.py | 23 ++++++++++++++- virtualizarr/tests/test_xarray.py | 46 +++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 3 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index b6687584..e9b96e64 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -34,6 +34,7 @@ def _generate_next_value_(name, start, count, last_values): class FileType(AutoName): netcdf3 = auto() netcdf4 = auto() # NOTE: netCDF4 is a subset of hdf5 + hdf4 = auto() hdf5 = auto() grib = auto() tiff = auto() @@ -100,11 +101,11 @@ def read_kerchunk_references_from_file( elif filetype.name.lower() == "tiff": from kerchunk.tiff import tiff_to_zarr - refs = tiff_to_zarr(filepath, inline_threshold=0, **reader_options) + refs = tiff_to_zarr(filepath, **reader_options) elif filetype.name.lower() == "fits": from kerchunk.fits import process_file - refs = process_file(filepath, inline_threshold=0, **reader_options) + refs = process_file(filepath, **reader_options) else: raise NotImplementedError(f"Unsupported file type: {filetype.name}") diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index cbe9bd54..c7a33cbb 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -223,15 +223,36 @@ def test_automatically_determine_filetype_netcdf3_netcdf4(): assert FileType("netcdf3") == _automatically_determine_filetype( filepath=netcdf3_file_path ) - assert FileType("netcdf4") == _automatically_determine_filetype( + assert FileType("hdf5") == _automatically_determine_filetype( filepath=netcdf4_file_path ) +@pytest.mark.parametrize("filetype,headerbytes", [("netcdf3", b"CDF"), + ("hdf5", b"\x89HDF"), + ("grib", b"GRIB"), + ("tiff", b"II*"), + ("fits", b"SIMPLE")]) +def test_valid_filetype_bytes(tmp_path, filetype, headerbytes): + filepath = tmp_path / "file.abc" + with open(filepath, 'wb') as f: + f.write(headerbytes) + assert FileType(filetype) == _automatically_determine_filetype( + filepath=filepath + ) + +def test_notimplemented_filetype(tmp_path): + for headerbytes in [b"JUNK", b"\x0e\x03\x13\x01"]: + filepath = tmp_path / "file.abc" + with open(filepath, 'wb') as f: + f.write(headerbytes) + with pytest.raises(NotImplementedError): + _automatically_determine_filetype(filepath=filepath) def test_FileType(): # tests if FileType converts user supplied strings to correct filetype assert "netcdf3" == FileType("netcdf3").name assert "netcdf4" == FileType("netcdf4").name + assert "hdf4" == FileType("hdf4").name assert "hdf5" == FileType("hdf5").name assert "grib" == FileType("grib").name assert "tiff" == FileType("tiff").name diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index e55583bf..1d4ae44a 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -308,6 +308,40 @@ def test_anon_read_s3(self, filetype, indexes): assert isinstance(vds[var].data, ManifestArray), var +@network +class TestReadFromURL: + def test_read_from_url(self): + examples = { + 'grib':'https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib', + + 'netcdf3':'https://github.com/pydata/xarray-data/raw/master/air_temperature.nc', + 'netcdf4':'https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc', + + 'hdf4':'https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf', + + # https://github.com/zarr-developers/VirtualiZarr/issues/159 + # https://nisar.jpl.nasa.gov/data/sample-data/ + #'hdf5':'https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/Soil_Moisture/ALOS-2/NISAR_L3_PR_SME2_001_008_D_070_4000_QPNA_A_20190829T180759_20190829T180809_P01101_M_P_J_001.h5', + #'hdf5':'https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5', + + # https://github.com/zarr-developers/VirtualiZarr/issues/160 + #'tiff':'https://github.com/corteva/rioxarray/raw/master/test/test_data/input/cog.tif', + #'tiff':'https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif', + # https://github.com/astropy/astropy/blob/4d034aa7e27e31cb0241cc01bbe76eab47406a91/astropy/io/fits/tests/test_fsspec.py#L73 + #'fits':'https://mast.stsci.edu/api/v0.1/Download/file/?uri=mast:HST/product/ibxl50020_jif.fits', + # https://github.com/fsspec/kerchunk/blob/ae692fead51a216691e4db9a67c99194c5ba8e14/kerchunk/tests/test_fits.py#L18 + #'fits':'https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits', + + 'jpg': 'https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg', + } + + for filetype, url in examples.items(): + if filetype in ['grib','jpg','hdf4']: + with pytest.raises(NotImplementedError): + vds = open_virtual_dataset(url, reader_options={}) + else: + vds = open_virtual_dataset(url, reader_options={}) + class TestLoadVirtualDataset: def test_loadable_variables(self, netcdf4_file): vars_to_load = ["air", "time"] @@ -325,6 +359,18 @@ def test_loadable_variables(self, netcdf4_file): if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) + def test_explicit_filetype(self, netcdf4_file): + # NOTE: not sure of best way to check these VDS are identical + #vds1 = open_virtual_dataset(netcdf4_file) + #vds2 = open_virtual_dataset(netcdf4_file, filetype="netcdf4") + #vds2 = open_virtual_dataset(netcdf4_file, filetype="hdf5") + + with pytest.raises(ValueError): + vds = open_virtual_dataset(netcdf4_file, filetype="unknown") + + with pytest.raises(NotImplementedError): + vds = open_virtual_dataset(netcdf4_file, filetype="grib") + @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") def test_open_virtual_dataset_passes_expected_args( self, mock_read_kerchunk, netcdf4_file From f8bbc6bfda962b0954c915ef9cf741f491565f27 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Tue, 25 Jun 2024 13:23:31 -0700 Subject: [PATCH 7/8] ruff --- docs/contributing.md | 1 + virtualizarr/kerchunk.py | 17 +++++------ virtualizarr/tests/test_kerchunk.py | 26 ++++++++++------- virtualizarr/tests/test_xarray.py | 44 +++++++++-------------------- 4 files changed, 40 insertions(+), 48 deletions(-) diff --git a/docs/contributing.md b/docs/contributing.md index dc6a73ec..4028dcaf 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -5,6 +5,7 @@ ```bash mamba env create -f ci/environment.yml mamba activate virtualizarr-tests +pre-commit install # git checkout -b new-feature python -m pip install -e . --no-deps python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index e9b96e64..46b4f2b4 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -33,7 +33,7 @@ def _generate_next_value_(name, start, count, last_values): class FileType(AutoName): netcdf3 = auto() - netcdf4 = auto() # NOTE: netCDF4 is a subset of hdf5 + netcdf4 = auto() # NOTE: netCDF4 is a subset of hdf5 hdf4 = auto() hdf5 = auto() grib = auto() @@ -118,11 +118,10 @@ def _automatically_determine_filetype( filepath: str, reader_options: Optional[dict[str, Any]] = None, ) -> FileType: - if Path(filepath).suffix == ".zarr": # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one... raise NotImplementedError() - + # Read magic bytes from local or remote file fpath = _fsspec_openfile_from_filepath( filepath=filepath, reader_options=reader_options @@ -133,17 +132,19 @@ def _automatically_determine_filetype( if magic_bytes.startswith(b"CDF"): filetype = FileType.netcdf3 elif magic_bytes.startswith(b"\x0e\x03\x13\x01"): - raise NotImplementedError(f"HDF4 formatted files not supported") + raise NotImplementedError("HDF4 formatted files not supported") elif magic_bytes.startswith(b"\x89HDF"): filetype = FileType.hdf5 - elif magic_bytes.startswith(b'GRIB'): + elif magic_bytes.startswith(b"GRIB"): filetype = FileType.grib - elif magic_bytes.startswith(b'II*'): + elif magic_bytes.startswith(b"II*"): filetype = FileType.tiff - elif magic_bytes.startswith(b'SIMPLE'): + elif magic_bytes.startswith(b"SIMPLE"): filetype = FileType.fits else: - raise NotImplementedError(f"Unrecognised file based on header bytes: {magic_bytes}") + raise NotImplementedError( + f"Unrecognised file based on header bytes: {magic_bytes}" + ) return filetype diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index c7a33cbb..22d6d7df 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -227,27 +227,33 @@ def test_automatically_determine_filetype_netcdf3_netcdf4(): filepath=netcdf4_file_path ) -@pytest.mark.parametrize("filetype,headerbytes", [("netcdf3", b"CDF"), - ("hdf5", b"\x89HDF"), - ("grib", b"GRIB"), - ("tiff", b"II*"), - ("fits", b"SIMPLE")]) + +@pytest.mark.parametrize( + "filetype,headerbytes", + [ + ("netcdf3", b"CDF"), + ("hdf5", b"\x89HDF"), + ("grib", b"GRIB"), + ("tiff", b"II*"), + ("fits", b"SIMPLE"), + ], +) def test_valid_filetype_bytes(tmp_path, filetype, headerbytes): filepath = tmp_path / "file.abc" - with open(filepath, 'wb') as f: + with open(filepath, "wb") as f: f.write(headerbytes) - assert FileType(filetype) == _automatically_determine_filetype( - filepath=filepath - ) + assert FileType(filetype) == _automatically_determine_filetype(filepath=filepath) + def test_notimplemented_filetype(tmp_path): for headerbytes in [b"JUNK", b"\x0e\x03\x13\x01"]: filepath = tmp_path / "file.abc" - with open(filepath, 'wb') as f: + with open(filepath, "wb") as f: f.write(headerbytes) with pytest.raises(NotImplementedError): _automatically_determine_filetype(filepath=filepath) + def test_FileType(): # tests if FileType converts user supplied strings to correct filetype assert "netcdf3" == FileType("netcdf3").name diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 1d4ae44a..d0f8cbce 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -312,35 +312,24 @@ def test_anon_read_s3(self, filetype, indexes): class TestReadFromURL: def test_read_from_url(self): examples = { - 'grib':'https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib', - - 'netcdf3':'https://github.com/pydata/xarray-data/raw/master/air_temperature.nc', - 'netcdf4':'https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc', - - 'hdf4':'https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf', - - # https://github.com/zarr-developers/VirtualiZarr/issues/159 - # https://nisar.jpl.nasa.gov/data/sample-data/ - #'hdf5':'https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/Soil_Moisture/ALOS-2/NISAR_L3_PR_SME2_001_008_D_070_4000_QPNA_A_20190829T180759_20190829T180809_P01101_M_P_J_001.h5', - #'hdf5':'https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5', - - # https://github.com/zarr-developers/VirtualiZarr/issues/160 - #'tiff':'https://github.com/corteva/rioxarray/raw/master/test/test_data/input/cog.tif', - #'tiff':'https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif', - # https://github.com/astropy/astropy/blob/4d034aa7e27e31cb0241cc01bbe76eab47406a91/astropy/io/fits/tests/test_fsspec.py#L73 - #'fits':'https://mast.stsci.edu/api/v0.1/Download/file/?uri=mast:HST/product/ibxl50020_jif.fits', - # https://github.com/fsspec/kerchunk/blob/ae692fead51a216691e4db9a67c99194c5ba8e14/kerchunk/tests/test_fits.py#L18 - #'fits':'https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits', - - 'jpg': 'https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg', + "grib": "https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib", + "netcdf3": "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc", + "netcdf4": "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc", + "hdf4": "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf", + "hdf5": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5", + "tiff": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif", + "fits": "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits", + "jpg": "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg", } - + for filetype, url in examples.items(): - if filetype in ['grib','jpg','hdf4']: + if filetype in ["grib", "jpg", "hdf4"]: with pytest.raises(NotImplementedError): vds = open_virtual_dataset(url, reader_options={}) else: vds = open_virtual_dataset(url, reader_options={}) + assert isinstance(vds, xr.Dataset) + class TestLoadVirtualDataset: def test_loadable_variables(self, netcdf4_file): @@ -360,16 +349,11 @@ def test_loadable_variables(self, netcdf4_file): xrt.assert_identical(vds.variables[name], full_ds.variables[name]) def test_explicit_filetype(self, netcdf4_file): - # NOTE: not sure of best way to check these VDS are identical - #vds1 = open_virtual_dataset(netcdf4_file) - #vds2 = open_virtual_dataset(netcdf4_file, filetype="netcdf4") - #vds2 = open_virtual_dataset(netcdf4_file, filetype="hdf5") - with pytest.raises(ValueError): - vds = open_virtual_dataset(netcdf4_file, filetype="unknown") + open_virtual_dataset(netcdf4_file, filetype="unknown") with pytest.raises(NotImplementedError): - vds = open_virtual_dataset(netcdf4_file, filetype="grib") + open_virtual_dataset(netcdf4_file, filetype="grib") @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") def test_open_virtual_dataset_passes_expected_args( From 928efd9729b824ecf1c3de1b797411ca40f24b51 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Tue, 25 Jun 2024 15:13:32 -0700 Subject: [PATCH 8/8] disable tif,fits,hdf5 examples for now --- ci/environment.yml | 2 -- virtualizarr/tests/test_xarray.py | 8 +++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index 1b122e3f..0385ea5a 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -3,7 +3,6 @@ channels: - conda-forge - nodefaults dependencies: - - astropy # for FITS - h5netcdf - h5py - hdf5 @@ -15,7 +14,6 @@ dependencies: - ujson - packaging - universal_pathlib - - tiffile # for TIFF # Testing - codecov - pre-commit diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index d0f8cbce..5a0f1d3d 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -316,9 +316,11 @@ def test_read_from_url(self): "netcdf3": "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc", "netcdf4": "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc", "hdf4": "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf", - "hdf5": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5", - "tiff": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif", - "fits": "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits", + # https://github.com/zarr-developers/VirtualiZarr/issues/159 + # "hdf5": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5", + # https://github.com/zarr-developers/VirtualiZarr/issues/160 + # "tiff": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif", + # "fits": "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits", "jpg": "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg", }