Filesystem spec (FSSpec)#
FSSpec is the standard package for reading bytes from various stores in the Python/data ecosystem. Intake uses FSSpec as part of its backend.
Data can be stored in many places: the local disk, Google Drive, Amazon AWS, etc. Filesystem Spec (fsspec) is a project that provides a unified, Pythonic interface to local, remote and embedded file systems and byte storage. As a result, the same code works across backends; all we need to provide is the credentials for each service.
For day-to-day usage, the official documentation is more than sufficient.
Documentation: https://filesystem-spec.readthedocs.io/
Basic examples#
Local file system without database#
import fsspec
# Write a file
with fsspec.open("file://data/fsspec_sample.txt", mode = 'w') as f:
f.write("Hello, World!\n")
# Read the file
with fsspec.open("file://data/fsspec_sample.txt", mode = 'r') as f:
print(f.read())
Hello, World!
Remote filesystem#
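We will use an S3-compatible filesystem served by a MinIO instance running on the local machine. Note that the `s3` protocol in fsspec requires the `s3fs` package to be installed.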
# Connect to S3 using fsspec
fs = fsspec.filesystem("s3",
key = 'minioadmin',
secret = 'minioadmin',
client_kwargs = {'endpoint_url': 'http://localhost:9000'}
)
fs.ls('my-bucket-2')
['my-bucket-2/data', 'my-bucket-2/images']
# Write a file to S3
# Note: we now call fs.open() instead of fsspec.open(), because fs is a filesystem object that is already connected to S3.
with fs.open("my-bucket-2/data/fsspec_sample.txt", mode = 'w') as f:
f.write("Hello, World!")
# Read a file from S3
with fs.open("my-bucket-2/data/data.csv", mode = 'r') as f:
print(f.read())
with fs.open("my-bucket-2/data/fsspec_sample.txt", mode = 'r') as f:
print("\n\n"+ f.read())
Name,Age,City
Alice,25,New York
Bob,30,London
Charlie,35,Paris
Hello, World!
Easily switch between local and remote filesystem#
# Read the file
print("Read local file")
with fsspec.open("file://data/fsspec_sample.txt", mode = 'r') as f:
print(f.read())
print("Read remote file")
with fsspec.open("s3://my-bucket-2/data/fsspec_sample.txt", mode ='r',
key = 'minioadmin',
secret = 'minioadmin',
client_kwargs = {'endpoint_url': 'http://localhost:9000'}) as f:
print(f.read())
Read local file
Hello, World!
Read remote file
Hello, World!
Open zarr file with xarray using fsspec#
import xarray as xr
store = fsspec.get_mapper("s3://my-bucket-2/data/zarr_data",
key = 'minioadmin',
secret = 'minioadmin',
client_kwargs = {'endpoint_url': 'http://localhost:9000'})
dataset = xr.open_zarr(store)
dataset
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr)
214 # Xarray-Zarr
--> 215 dimensions = zarr_obj.attrs[dimension_key]
216 except KeyError as e:
~/anaconda3/lib/python3.9/site-packages/zarr/attrs.py in __getitem__(self, item)
73 def __getitem__(self, item):
---> 74 return self.asdict()[item]
75
KeyError: '_ARRAY_DIMENSIONS'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr)
228 dimensions = [
--> 229 os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]
230 ]
KeyError: '_NCZARR_ARRAY'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
/tmp/ipykernel_1945216/2753404152.py in <module>
----> 1 dataset = xr.open_zarr(store)
2 dataset
~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in open_zarr(store, group, synchronizer, chunks, decode_cf, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, consolidated, overwrite_encoded_chunks, chunk_store, storage_options, decode_timedelta, use_cftime, zarr_version, chunked_array_type, from_array_kwargs, **kwargs)
1101 }
1102
-> 1103 ds = open_dataset(
1104 filename_or_obj=store,
1105 group=group,
~/anaconda3/lib/python3.9/site-packages/xarray/backends/api.py in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, chunked_array_type, from_array_kwargs, backend_kwargs, **kwargs)
586
587 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 588 backend_ds = backend.open_dataset(
589 filename_or_obj,
590 drop_variables=drop_variables,
~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, synchronizer, consolidated, chunk_store, storage_options, stacklevel, zarr_version, store, engine)
1186 store_entrypoint = StoreBackendEntrypoint()
1187 with close_on_error(store):
-> 1188 ds = store_entrypoint.open_dataset(
1189 store,
1190 mask_and_scale=mask_and_scale,
~/anaconda3/lib/python3.9/site-packages/xarray/backends/store.py in open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta)
41 assert isinstance(filename_or_obj, AbstractDataStore)
42
---> 43 vars, attrs = filename_or_obj.load()
44 encoding = filename_or_obj.get_encoding()
45
~/anaconda3/lib/python3.9/site-packages/xarray/backends/common.py in load(self)
219 """
220 variables = FrozenDict(
--> 221 (_decode_variable_name(k), v) for k, v in self.get_variables().items()
222 )
223 attributes = FrozenDict(self.get_attrs())
~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in get_variables(self)
561
562 def get_variables(self):
--> 563 return FrozenDict(
564 (k, self.open_store_variable(k, v)) for k, v in self.zarr_group.arrays()
565 )
~/anaconda3/lib/python3.9/site-packages/xarray/core/utils.py in FrozenDict(*args, **kwargs)
434
435 def FrozenDict(*args, **kwargs) -> Frozen:
--> 436 return Frozen(dict(*args, **kwargs))
437
438
~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in <genexpr>(.0)
562 def get_variables(self):
563 return FrozenDict(
--> 564 (k, self.open_store_variable(k, v)) for k, v in self.zarr_group.arrays()
565 )
566
~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in open_store_variable(self, name, zarr_array)
538 data = indexing.LazilyIndexedArray(ZarrArrayWrapper(zarr_array))
539 try_nczarr = self._mode == "r"
--> 540 dimensions, attributes = _get_zarr_dims_and_attrs(
541 zarr_array, DIMENSION_KEY, try_nczarr
542 )
~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr)
230 ]
231 except KeyError as e:
--> 232 raise KeyError(
233 f"Zarr object is missing the attribute `{dimension_key}` and the NCZarr metadata, "
234 "which are required for xarray to determine variable dimensions."
KeyError: 'Zarr object is missing the attribute `_ARRAY_DIMENSIONS` and the NCZarr metadata, which are required for xarray to determine variable dimensions.'
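I just realised that zarr_data was created directly with the zarr library, so xarray can’t open it: the arrays lack the `_ARRAY_DIMENSIONS` attribute (and the NCZarr metadata) that xarray needs to determine variable dimensions. Anyway, this is the code to open a Zarr store that was saved by xarray, using fsspec.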