Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use magic bytes to identify file formats #143

Merged
merged 9 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/contributing.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# Contributing

## Contributing code

```bash
mamba env create -f ci/environment.yml
mamba activate virtualizarr-tests
pre-commit install
# git checkout -b new-feature
python -m pip install -e . --no-deps
python -m pytest ./virtualizarr --run-network-tests --cov=./ --cov-report=xml --verbose
```

## Contributing documentation

### Build the documentation locally
Expand Down
2 changes: 2 additions & 0 deletions docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ Internal Changes
(:pull:`107`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Mark tests which require network access so that they are only run when `--run-network-tests` is passed a command-line argument to pytest.
(:pull:`144`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Determine file format from magic bytes rather than name suffix
(:pull:`143`) By `Scott Henderson <https://github.com/scottyhq>`_.

.. _v0.1:

Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,8 @@ line-ending = "auto"

[tool.ruff.lint.isort]
known-first-party = ["virtualizarr"]

[tool.pytest.ini_options]
markers = [
"network: marks test requiring internet (select with '--run-network-tests')",
]
48 changes: 25 additions & 23 deletions virtualizarr/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ def _generate_next_value_(name, start, count, last_values):

class FileType(AutoName):
netcdf3 = auto()
netcdf4 = auto()
netcdf4 = auto() # NOTE: netCDF4 is a subset of hdf5
hdf4 = auto()
hdf5 = auto()
grib = auto()
tiff = auto()
fits = auto()
Expand Down Expand Up @@ -86,7 +88,7 @@ def read_kerchunk_references_from_file(

refs = NetCDF3ToZarr(filepath, inline_threshold=0, **reader_options).translate()

elif filetype.name.lower() == "netcdf4":
elif filetype.name.lower() == "hdf5" or filetype.name.lower() == "netcdf4":
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

keeping netcdf4 as an alias for hdf5 because i've seen code in reported issues using filetype=netcdf4

from kerchunk.hdf import SingleHdf5ToZarr

refs = SingleHdf5ToZarr(
Expand All @@ -99,11 +101,11 @@ def read_kerchunk_references_from_file(
elif filetype.name.lower() == "tiff":
from kerchunk.tiff import tiff_to_zarr

refs = tiff_to_zarr(filepath, inline_threshold=0, **reader_options)
refs = tiff_to_zarr(filepath, **reader_options)
elif filetype.name.lower() == "fits":
from kerchunk.fits import process_file

refs = process_file(filepath, inline_threshold=0, **reader_options)
refs = process_file(filepath, **reader_options)
else:
raise NotImplementedError(f"Unsupported file type: {filetype.name}")

Expand All @@ -116,34 +118,34 @@ def _automatically_determine_filetype(
filepath: str,
reader_options: Optional[dict[str, Any]] = None,
) -> FileType:
file_extension = Path(filepath).suffix
if Path(filepath).suffix == ".zarr":
# TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one...
raise NotImplementedError()

# Read magic bytes from local or remote file
fpath = _fsspec_openfile_from_filepath(
filepath=filepath, reader_options=reader_options
)
magic_bytes = fpath.read(8)
fpath.close()

if file_extension == ".nc":
# based off of: https://github.com/TomNicholas/VirtualiZarr/pull/43#discussion_r1543415167
magic = fpath.read()

if magic[0:3] == b"CDF":
filetype = FileType.netcdf3
elif magic[1:4] == b"HDF":
filetype = FileType.netcdf4
else:
raise ValueError(".nc file does not appear to be NETCDF3 OR NETCDF4")
elif file_extension == ".zarr":
# TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one...
raise NotImplementedError()
elif file_extension == ".grib":
if magic_bytes.startswith(b"CDF"):
filetype = FileType.netcdf3
elif magic_bytes.startswith(b"\x0e\x03\x13\x01"):
raise NotImplementedError("HDF4 formatted files not supported")
elif magic_bytes.startswith(b"\x89HDF"):
filetype = FileType.hdf5
elif magic_bytes.startswith(b"GRIB"):
filetype = FileType.grib
elif file_extension == ".tiff":
elif magic_bytes.startswith(b"II*"):
filetype = FileType.tiff
elif file_extension == ".fits":
elif magic_bytes.startswith(b"SIMPLE"):
filetype = FileType.fits
else:
raise NotImplementedError(f"Unrecognised file extension: {file_extension}")
raise NotImplementedError(
f"Unrecognised file based on header bytes: {magic_bytes}"
)

fpath.close()
return filetype


Expand Down
30 changes: 29 additions & 1 deletion virtualizarr/tests/test_kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,15 +223,43 @@ def test_automatically_determine_filetype_netcdf3_netcdf4():
assert FileType("netcdf3") == _automatically_determine_filetype(
filepath=netcdf3_file_path
)
assert FileType("netcdf4") == _automatically_determine_filetype(
assert FileType("hdf5") == _automatically_determine_filetype(
filepath=netcdf4_file_path
)


@pytest.mark.parametrize(
"filetype,headerbytes",
[
("netcdf3", b"CDF"),
("hdf5", b"\x89HDF"),
("grib", b"GRIB"),
("tiff", b"II*"),
("fits", b"SIMPLE"),
],
)
def test_valid_filetype_bytes(tmp_path, filetype, headerbytes):
filepath = tmp_path / "file.abc"
with open(filepath, "wb") as f:
f.write(headerbytes)
assert FileType(filetype) == _automatically_determine_filetype(filepath=filepath)


def test_notimplemented_filetype(tmp_path):
for headerbytes in [b"JUNK", b"\x0e\x03\x13\x01"]:
filepath = tmp_path / "file.abc"
with open(filepath, "wb") as f:
f.write(headerbytes)
with pytest.raises(NotImplementedError):
_automatically_determine_filetype(filepath=filepath)


def test_FileType():
# tests if FileType converts user supplied strings to correct filetype
assert "netcdf3" == FileType("netcdf3").name
assert "netcdf4" == FileType("netcdf4").name
assert "hdf4" == FileType("hdf4").name
assert "hdf5" == FileType("hdf5").name
assert "grib" == FileType("grib").name
assert "tiff" == FileType("tiff").name
assert "fits" == FileType("fits").name
Expand Down
32 changes: 32 additions & 0 deletions virtualizarr/tests/test_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,31 @@ def test_anon_read_s3(self, filetype, indexes):
assert isinstance(vds[var].data, ManifestArray), var


@network
class TestReadFromURL:
def test_read_from_url(self):
examples = {
"grib": "https://github.com/pydata/xarray-data/raw/master/era5-2mt-2019-03-uk.grib",
"netcdf3": "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc",
"netcdf4": "https://github.com/pydata/xarray-data/raw/master/ROMS_example.nc",
"hdf4": "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf",
# https://github.com/zarr-developers/VirtualiZarr/issues/159
# "hdf5": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5",
# https://github.com/zarr-developers/VirtualiZarr/issues/160
# "tiff": "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif",
# "fits": "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits",
Comment on lines +319 to +323
Copy link
Contributor Author

@scottyhq scottyhq Jun 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commenting these for now, but thought I'd leave them to track #159 and #160

"jpg": "https://github.com/rasterio/rasterio/raw/main/tests/data/389225main_sw_1965_1024.jpg",
}

for filetype, url in examples.items():
if filetype in ["grib", "jpg", "hdf4"]:
with pytest.raises(NotImplementedError):
vds = open_virtual_dataset(url, reader_options={})
else:
vds = open_virtual_dataset(url, reader_options={})
assert isinstance(vds, xr.Dataset)


class TestLoadVirtualDataset:
def test_loadable_variables(self, netcdf4_file):
vars_to_load = ["air", "time"]
Expand All @@ -325,6 +350,13 @@ def test_loadable_variables(self, netcdf4_file):
if name in vars_to_load:
xrt.assert_identical(vds.variables[name], full_ds.variables[name])

def test_explicit_filetype(self, netcdf4_file):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't sure of best way to check these VDS are identical:

 vds1 = open_virtual_dataset(netcdf4_file) 
 vds2 = open_virtual_dataset(netcdf4_file, filetype="netcdf4") 
 vds2 = open_virtual_dataset(netcdf4_file, filetype="hdf5")

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you open with indexes={}, does xrt.assert_identical work?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't seem to work:

AssertionError: Left and right Dataset objects are not identical

Differing coordinates:
L   ocean_time  (ocean_time) float64 16B ManifestArray<shape=(2,), dtype=floa...
R   ocean_time  (ocean_time) float64 16B ManifestArray<shape=(2,), dtype=floa...
L   s_rho       (s_rho) float64 240B ManifestArray<shape=(30,), dtype=float64...
R   s_rho       (s_rho) float64 240B ManifestArray<shape=(30,), dtype=float64...
Differing data variables:
L   lat_rho     (eta_rho, xi_rho) float64 567kB ManifestArray<shape=(191, 371...
R   lat_rho     (eta_rho, xi_rho) float64 567kB ManifestArray<shape=(191, 371...
L   Cs_r        (s_rho) float64 240B ManifestArray<shape=(30,), dtype=float64...
R   Cs_r        (s_rho) float64 240B ManifestArray<shape=(30,), dtype=float64...
L   h           (eta_rho, xi_rho) float64 567kB ManifestArray<shape=(191, 371...
R   h           (eta_rho, xi_rho) float64 567kB ManifestArray<shape=(191, 371...
L   lon_rho     (eta_rho, xi_rho) float64 567kB ManifestArray<shape=(191, 371...
R   lon_rho     (eta_rho, xi_rho) float64 567kB ManifestArray<shape=(191, 371...
L   hc          float64 8B ManifestArray<shape=(), dtype=float64, chunks=()>
R   hc          float64 8B ManifestArray<shape=(), dtype=float64, chunks=()>
L   salt        (ocean_time, s_rho, eta_rho, xi_rho) float32 17MB ManifestArr...
R   salt        (ocean_time, s_rho, eta_rho, xi_rho) float32 17MB ManifestArr...
L   zeta        (ocean_time, eta_rho, xi_rho) float32 567kB ManifestArray<sha...
R   zeta        (ocean_time, eta_rho, xi_rho) float32 567kB ManifestArray<sha...

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm okay - I've raised #161 to track that.

with pytest.raises(ValueError):
open_virtual_dataset(netcdf4_file, filetype="unknown")

with pytest.raises(NotImplementedError):
open_virtual_dataset(netcdf4_file, filetype="grib")

@patch("virtualizarr.kerchunk.read_kerchunk_references_from_file")
def test_open_virtual_dataset_passes_expected_args(
self, mock_read_kerchunk, netcdf4_file
Expand Down
4 changes: 2 additions & 2 deletions virtualizarr/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ def open_virtual_dataset(
File path to open as a set of virtualized zarr arrays.
filetype : FileType, default None
Type of file to be opened. Used to determine which kerchunk file format backend to use.
Can be one of {'netCDF3', 'netCDF4', 'zarr_v3'}.
If not provided will attempt to automatically infer the correct filetype from the the filepath's extension.
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}.
If not provided will attempt to automatically infer the correct filetype from header bytes.
drop_variables: list[str], default is None
Variables in the file to drop before returning.
loadable_variables: list[str], default is None
Expand Down
Loading