
Real world use case: Virtualizarring CMIP6 data #93

jbusecke opened this issue Apr 25, 2024 · 6 comments
Labels: usage example (Real world use case examples)

@jbusecke commented Apr 25, 2024

Note

All of this is currently based on a dev branch which represents a merge of main into #67

Motivated to come up with a proof of concept by tomorrow for the ESGF conference I am at right now, I am trying to test VirtualiZarr on real-world CMIP6 data on S3 (a complex example for #61).

I am running the following:

from virtualizarr import open_virtual_dataset
import xarray as xr

files = [
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_185001-186012.nc',
    's3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_187101-188012.nc',
]

# Build a virtual (reference-only) dataset for each file
vds_list = []
for f in files:
    vds = open_virtual_dataset(f, filetype='netCDF4', indexes={})
    vds_list.append(vds)

# Concatenate along time and write the combined references out as kerchunk JSON
combined_vds = xr.combine_nested(vds_list, concat_dim=['time'], coords='minimal', compat='override')
combined_vds.virtualize.to_kerchunk('combined.json', format='json')

This works up to this point, which is really phenomenal. Thanks for the great work here.

[screenshot: repr of the combined virtual dataset]

But when I try to read from the reference file

import fsspec

fs = fsspec.filesystem("reference", fo="combined.json")
mapper = fs.get_mapper("")

combined_ds = xr.open_dataset(mapper, engine="kerchunk")

I get this error:

---------------------------------------------------------------------------
ClientError                               Traceback (most recent call last)
File /srv/conda/envs/notebook/lib/python3.11/site-packages/s3fs/core.py:113, in _error_wrapper(func, args, kwargs, retries)
    112 try:
--> 113     return await func(*args, **kwargs)
    114 except S3_RETRYABLE_ERRORS as e:

File /srv/conda/envs/notebook/lib/python3.11/site-packages/aiobotocore/client.py:408, in AioBaseClient._make_api_call(self, operation_name, api_params)
407 error_class = self.exceptions.from_code(error_code)
--> 408 raise error_class(parsed_response, operation_name)
409 else:

ClientError: An error occurred (InvalidAccessKeyId) when calling the GetObject operation: The AWS Access Key Id you provided does not exist in our records.

The above exception was the direct cause of the following exception:

PermissionError Traceback (most recent call last)
File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/asyn.py:245, in _run_coros_in_chunks.._run_coro(coro, i)
244 try:
--> 245 return await asyncio.wait_for(coro, timeout=timeout), i
246 except Exception as e:

File /srv/conda/envs/notebook/lib/python3.11/asyncio/tasks.py:452, in wait_for(fut, timeout)
451 if timeout is None:
--> 452 return await fut
454 if timeout <= 0:

File /srv/conda/envs/notebook/lib/python3.11/site-packages/s3fs/core.py:1125, in S3FileSystem._cat_file(self, path, version_id, start, end)
1123 resp["Body"].close()
-> 1125 return await _error_wrapper(_call_and_read, retries=self.retries)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/s3fs/core.py:142, in _error_wrapper(func, args, kwargs, retries)
141 err = translate_boto_error(err)
--> 142 raise err

File /srv/conda/envs/notebook/lib/python3.11/site-packages/s3fs/core.py:113, in _error_wrapper(func, args, kwargs, retries)
112 try:
--> 113 return await func(*args, **kwargs)
114 except S3_RETRYABLE_ERRORS as e:

File /srv/conda/envs/notebook/lib/python3.11/site-packages/s3fs/core.py:1112, in S3FileSystem._cat_file.._call_and_read()
1111 async def _call_and_read():
-> 1112 resp = await self._call_s3(
1113 "get_object",
1114 Bucket=bucket,
1115 Key=key,
1116 **version_id_kw(version_id or vers),
1117 **head,
1118 **self.req_kw,
1119 )
1120 try:

File /srv/conda/envs/notebook/lib/python3.11/site-packages/s3fs/core.py:362, in S3FileSystem._call_s3(self, method, *akwarglist, **kwargs)
361 additional_kwargs = self._get_s3_method_kwargs(method, *akwarglist, **kwargs)
--> 362 return await _error_wrapper(
363 method, kwargs=additional_kwargs, retries=self.retries
364 )

File /srv/conda/envs/notebook/lib/python3.11/site-packages/s3fs/core.py:142, in _error_wrapper(func, args, kwargs, retries)
141 err = translate_boto_error(err)
--> 142 raise err

PermissionError: The AWS Access Key Id you provided does not exist in our records.

The above exception was the direct cause of the following exception:

ReferenceNotReachable Traceback (most recent call last)
Cell In[17], line 6
3 fs = fsspec.filesystem("reference", fo=f"combined.json", anon=True)
4 mapper = fs.get_mapper("")
----> 6 combined_ds = xr.open_dataset(mapper, engine="kerchunk")

File /srv/conda/envs/notebook/lib/python3.11/site-packages/xarray/backends/api.py:573, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, chunked_array_type, from_array_kwargs, backend_kwargs, **kwargs)
561 decoders = _resolve_decoders_kwargs(
562 decode_cf,
563 open_backend_dataset_parameters=backend.open_dataset_parameters,
(...)
569 decode_coords=decode_coords,
570 )
572 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 573 backend_ds = backend.open_dataset(
574 filename_or_obj,
575 drop_variables=drop_variables,
576 **decoders,
577 **kwargs,
578 )
579 ds = _dataset_from_backend_dataset(
580 backend_ds,
581 filename_or_obj,
(...)
591 **kwargs,
592 )
593 return ds

File /srv/conda/envs/notebook/lib/python3.11/site-packages/kerchunk/xarray_backend.py:17, in KerchunkBackend.open_dataset(self, filename_or_obj, drop_variables, storage_options, open_dataset_options)
8 def open_dataset(
9 self,
10 filename_or_obj,
(...)
14 open_dataset_options=None
15 ):
---> 17 ref_ds = open_reference_dataset(
18 filename_or_obj,
19 storage_options=storage_options,
20 open_dataset_options=open_dataset_options,
21 )
22 if drop_variables is not None:
23 ref_ds = ref_ds.drop_vars(drop_variables)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/kerchunk/xarray_backend.py:51, in open_reference_dataset(filename_or_obj, storage_options, open_dataset_options)
48 if open_dataset_options is None:
49 open_dataset_options = {}
---> 51 m = fsspec.get_mapper("reference://", fo=filename_or_obj, **storage_options)
53 return xr.open_dataset(m, engine="zarr", consolidated=False, **open_dataset_options)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/mapping.py:249, in get_mapper(url, check, create, missing_exceptions, alternate_root, **kwargs)
218 """Create key-value interface for given URL and options
219
220 The URL will be of the form "protocol://location" and point to the root
(...)
246 FSMap instance, the dict-like key-value store.
247 """
248 # Removing protocol here - could defer to each open() on the backend
--> 249 fs, urlpath = url_to_fs(url, **kwargs)
250 root = alternate_root if alternate_root is not None else urlpath
251 return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/core.py:395, in url_to_fs(url, **kwargs)
393 inkwargs["fo"] = urls
394 urlpath, protocol, _ = chain[0]
--> 395 fs = filesystem(protocol, **inkwargs)
396 return fs, urlpath

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/registry.py:293, in filesystem(protocol, **storage_options)
286 warnings.warn(
287 "The 'arrow_hdfs' protocol has been deprecated and will be "
288 "removed in the future. Specify it as 'hdfs'.",
289 DeprecationWarning,
290 )
292 cls = get_filesystem_class(protocol)
--> 293 return cls(**storage_options)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/spec.py:80, in _Cached.call(cls, *args, **kwargs)
78 return cls._cache[token]
79 else:
---> 80 obj = super().call(*args, **kwargs)
81 # Setting _fs_token here causes some static linters to complain.
82 obj.fs_token = token

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/implementations/reference.py:713, in ReferenceFileSystem.init(self, fo, target, ref_storage_args, target_protocol, target_options, remote_protocol, remote_options, fs, template_overrides, simple_templates, max_gap, max_block, cache_size, **kwargs)
709 self.fss[protocol] = fs
710 if remote_protocol is None:
711 # get single protocol from references
712 # TODO: warning here, since this can be very expensive?
--> 713 for ref in self.references.values():
714 if callable(ref):
715 ref = ref()

File :880, in iter(self)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/mapping.py:155, in FSMap.getitem(self, key, default)
153 k = self._key_to_str(key)
154 try:
--> 155 result = self.fs.cat(k)
156 except self.missing_exceptions:
157 if default is not None:

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/implementations/reference.py:917, in ReferenceFileSystem.cat(self, path, recursive, on_error, **kwargs)
915 new_ex.cause = ex
916 if on_error == "raise":
--> 917 raise new_ex
918 elif on_error != "omit":
919 out[k] = new_ex

ReferenceNotReachable: Reference "i/0" failed to fetch target ['s3://esgf-world/CMIP6/CMIP/CCCma/CanESM5/historical/r10i1p1f1/Omon/uo/gn/v20190429/uo_Omon_CanESM5_historical_r10i1p1f1_gn_185001-186012.nc', 47078, 1440]

To me this indicates that somehow the required storage_options={'anon':True} is not properly passed.
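Judging from the traceback above, kerchunk's xarray backend seems to forward its storage_options straight into fsspec.get_mapper("reference://", fo=...), so an untested sketch of threading the anonymous-access options through engine="kerchunk" might look like this (remote_protocol and remote_options here are fsspec ReferenceFileSystem options, not kerchunk-specific arguments):

# Untested sketch: forward the reference-filesystem options through the kerchunk engine
combined_ds = xr.open_dataset(
    "combined.json",
    engine="kerchunk",
    storage_options={
        "remote_protocol": "s3",
        "remote_options": {"anon": True},
    },
)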

Adding

fs = fsspec.filesystem("reference", fo="combined.json", remote_options={'anon': True})

gets around that error, but opening the dataset never completes. After waiting for about 10 minutes I interrupted it and got this trace:

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
File :6

File /srv/conda/envs/notebook/lib/python3.11/site-packages/xarray/backends/api.py:573, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, chunked_array_type, from_array_kwargs, backend_kwargs, **kwargs)
561 decoders = _resolve_decoders_kwargs(
562 decode_cf,
563 open_backend_dataset_parameters=backend.open_dataset_parameters,
(...)
569 decode_coords=decode_coords,
570 )
572 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 573 backend_ds = backend.open_dataset(
574 filename_or_obj,
575 drop_variables=drop_variables,
576 **decoders,
577 **kwargs,
578 )
579 ds = _dataset_from_backend_dataset(
580 backend_ds,
581 filename_or_obj,
(...)
591 **kwargs,
592 )
593 return ds

File /srv/conda/envs/notebook/lib/python3.11/site-packages/kerchunk/xarray_backend.py:17, in KerchunkBackend.open_dataset(self, filename_or_obj, drop_variables, storage_options, open_dataset_options)
8 def open_dataset(
9 self,
10 filename_or_obj,
(...)
14 open_dataset_options=None
15 ):
---> 17 ref_ds = open_reference_dataset(
18 filename_or_obj,
19 storage_options=storage_options,
20 open_dataset_options=open_dataset_options,
21 )
22 if drop_variables is not None:
23 ref_ds = ref_ds.drop_vars(drop_variables)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/kerchunk/xarray_backend.py:51, in open_reference_dataset(filename_or_obj, storage_options, open_dataset_options)
48 if open_dataset_options is None:
49 open_dataset_options = {}
---> 51 m = fsspec.get_mapper("reference://", fo=filename_or_obj, **storage_options)
53 return xr.open_dataset(m, engine="zarr", consolidated=False, **open_dataset_options)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/mapping.py:249, in get_mapper(url, check, create, missing_exceptions, alternate_root, **kwargs)
218 """Create key-value interface for given URL and options
219
220 The URL will be of the form "protocol://location" and point to the root
(...)
246 FSMap instance, the dict-like key-value store.
247 """
248 # Removing protocol here - could defer to each open() on the backend
--> 249 fs, urlpath = url_to_fs(url, **kwargs)
250 root = alternate_root if alternate_root is not None else urlpath
251 return FSMap(root, fs, check, create, missing_exceptions=missing_exceptions)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/core.py:395, in url_to_fs(url, **kwargs)
393 inkwargs["fo"] = urls
394 urlpath, protocol, _ = chain[0]
--> 395 fs = filesystem(protocol, **inkwargs)
396 return fs, urlpath

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/registry.py:293, in filesystem(protocol, **storage_options)
286 warnings.warn(
287 "The 'arrow_hdfs' protocol has been deprecated and will be "
288 "removed in the future. Specify it as 'hdfs'.",
289 DeprecationWarning,
290 )
292 cls = get_filesystem_class(protocol)
--> 293 return cls(**storage_options)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/spec.py:80, in _Cached.call(cls, *args, **kwargs)
78 return cls._cache[token]
79 else:
---> 80 obj = super().call(*args, **kwargs)
81 # Setting _fs_token here causes some static linters to complain.
82 obj.fs_token = token

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/implementations/reference.py:713, in ReferenceFileSystem.init(self, fo, target, ref_storage_args, target_protocol, target_options, remote_protocol, remote_options, fs, template_overrides, simple_templates, max_gap, max_block, cache_size, **kwargs)
709 self.fss[protocol] = fs
710 if remote_protocol is None:
711 # get single protocol from references
712 # TODO: warning here, since this can be very expensive?
--> 713 for ref in self.references.values():
714 if callable(ref):
715 ref = ref()

File :880, in iter(self)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/mapping.py:155, in FSMap.getitem(self, key, default)
153 k = self._key_to_str(key)
154 try:
--> 155 result = self.fs.cat(k)
156 except self.missing_exceptions:
157 if default is not None:

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/implementations/reference.py:892, in ReferenceFileSystem.cat(self, path, recursive, on_error, **kwargs)
883 # merge and fetch consolidated ranges
884 new_paths, new_starts, new_ends = merge_offset_ranges(
885 list(urls2),
886 list(starts2),
(...)
890 max_block=self.max_block,
891 )
--> 892 bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends)
894 # unbundle from merged bytes - simple approach
895 for u, s, e, p in zip(urls, starts, ends, valid_paths):

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/asyn.py:118, in sync_wrapper..wrapper(*args, **kwargs)
115 @functools.wraps(func)
116 def wrapper(*args, **kwargs):
117 self = obj or args[0]
--> 118 return sync(self.loop, func, *args, **kwargs)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/fsspec/asyn.py:91, in sync(loop, func, timeout, *args, **kwargs)
88 asyncio.run_coroutine_threadsafe(_runner(event, coro, result, timeout), loop)
89 while True:
90 # this loops allows thread to get interrupted
---> 91 if event.wait(1):
92 break
93 if timeout is not None:

File /srv/conda/envs/notebook/lib/python3.11/threading.py:629, in Event.wait(self, timeout)
627 signaled = self._flag
628 if not signaled:
--> 629 signaled = self._cond.wait(timeout)
630 return signaled

File /srv/conda/envs/notebook/lib/python3.11/threading.py:331, in Condition.wait(self, timeout)
329 else:
330 if timeout > 0:
--> 331 gotit = waiter.acquire(True, timeout)
332 else:
333 gotit = waiter.acquire(False)

KeyboardInterrupt:

I might be misinterpreting this, but it looks exactly like the trace from the 'pangeo-forge rechunking stall' issue (can't find the original issue right now).

I am definitely too tired to dig deeper, but I am wondering a few things:

  • Once this is written as a zarr, will the need to pass storage options go away?
  • Is there a way to not use fsspec to use the reference files at the moment?

Super happy to keep working on this!

@jbusecke commented:

Oh, I got it! (this is from @mgrover1's notebook)

ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "fo": "combined.json",
            "remote_protocol": "s3",
            "remote_options": {"anon": True},
        },
    },
)
ds

works brilliantly!

[screenshot: repr of the dataset opened from the references]

Honestly, I have no clue what is happening here, but it's also not that important in the long term, I guess, hehe.

@TomNicholas commented:

To me this indicates that somehow the required storage_options={'anon':True} is not properly passed.

That might well be the case.

I actually forgot we hadn't merged #67 yet - it would be great to have that tested and merged.

Once this is written as a zarr, will the need to pass storage options go away?

Once it's written using the chunk manifest specification, and zarr-python implements the same ZEP, it will be read from S3 however zarr-python implements that, which I think will be via the Rust object-store crate. I don't know what options have to be passed to that.

Is there a way to not use fsspec to use the reference files at the moment?

You need fsspec to understand the reference files if they are written out following the kerchunk format.
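For context, a kerchunk reference file is essentially a JSON mapping from zarr keys to either inline metadata or [url, offset, length] byte-range triples, which is what fsspec's ReferenceFileSystem resolves. A minimal, illustrative sketch of the layout (most keys elided, byte range taken from the error above):

# Illustrative only: the real combined.json produced above contains many more keys
refs = {
    "version": 1,
    "refs": {
        ".zgroup": '{"zarr_format": 2}',  # inline zarr group metadata
        "i/.zarray": "{...}",             # array metadata, elided here
        # a chunk key pointing at a byte range inside one of the original netCDFs on S3:
        "i/0": ["s3://esgf-world/.../uo_Omon_CanESM5_historical_r10i1p1f1_gn_185001-186012.nc", 47078, 1440],
    },
}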

@TomNicholas commented:

@jbusecke nice! If that works but engine='kerchunk' doesn't work then presumably there is a bug with kerchunk's xarray backend...

I would double-check that you can load this data and that the values are as you expect (watch out for subtleties with encoding)...

@jbusecke commented:

Ok I tried this for loading:

from dask.diagnostics import ProgressBar
import xarray as xr

ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    chunks={},
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "fo": "combined.json",
            "remote_protocol": "s3",
            "remote_options": {"anon": True},
        },
    },
)

with ProgressBar():
    da_plot = ds.uo.mean(['time', 'lev']).load()
da_plot.plot()

[screenshot: plot of the time- and depth-mean uo field]

That seems totally fine to me on the loading side.

Do you have recommendations for how to check the encoding in a comprehensive manner?

@TomNicholas commented:

That seems totally fine to me on the loading side.

Great!

Do you have recommendations for how to check the encoding in a comprehensive manner?

Well, the source of truth here would be to open the files directly with xarray, without kerchunk, i.e. open_mfdataset on the raw netCDFs.
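A rough, untested sketch of such a check, assuming s3fs and h5netcdf are installed and reusing the files list from the first comment:

import fsspec
import xarray as xr

# Open the raw netCDFs directly (the source of truth), anonymously
open_files = [fsspec.open(f, anon=True) for f in files]
ds_raw = xr.open_mfdataset(
    [of.open() for of in open_files],
    engine="h5netcdf",
    combine="nested",
    concat_dim="time",
)

# Open the combined kerchunk references as before
ds_ref = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "fo": "combined.json",
            "remote_protocol": "s3",
            "remote_options": {"anon": True},
        },
    },
)

# Compare decoded values and a few key encoding attributes
xr.testing.assert_allclose(ds_raw["uo"], ds_ref["uo"])
for key in ["dtype", "scale_factor", "add_offset", "_FillValue"]:
    print(key, ds_raw["uo"].encoding.get(key), ds_ref["uo"].encoding.get(key))

That would compare the decoded values directly and surface any mismatched encoding attributes.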

@norlandrhagen commented:

Very cool to see a real world example @jbusecke!
