Filesystem spec (FSSpec)#

FSSpec is the standard package for reading bytes from various stores in the Python/data ecosystem. Intake uses FSSpec as part of its backend.

Data can be stored in many places: on a local disk, Google Drive, Amazon AWS, etc. Filesystem Spec (fsspec) is a project to provide a unified Pythonic interface to local, remote and embedded file systems and bytes storage. So, all we need to do is provide credentials for each service.

For day-to-day usage, the project's own documentation is more than enough.

Documentation: link

Basic examples#

Local file system without database#

import fsspec

# Location of the sample file, addressed through the local "file://" protocol
sample_url = "file://data/fsspec_sample.txt"

# Create (or overwrite) the file with a single line of text
with fsspec.open(sample_url, mode="w") as out_file:
    out_file.write("Hello, World!\n")

# Re-open the same file for reading and print its contents
with fsspec.open(sample_url, mode="r") as in_file:
    print(in_file.read())
Hello, World!

Remote filesystem#

We will use an S3-compatible filesystem that is served by MinIO running on our local PC.

# Build an S3 filesystem object with fsspec, pointing the client at the
# MinIO endpoint on localhost instead of the default S3 endpoint.
minio_client_kwargs = {'endpoint_url': 'http://localhost:9000'}

fs = fsspec.filesystem(
    "s3",
    key='minioadmin',
    secret='minioadmin',
    client_kwargs=minio_client_kwargs,
)

# List the entries directly under the bucket
fs.ls('my-bucket-2')
['my-bucket-2/data', 'my-bucket-2/images']
# Write a text file to the bucket.
# Note that we call fs.open() here rather than fsspec.open(): fs is the
# filesystem object that is already connected to the S3 store.
with fs.open("my-bucket-2/data/fsspec_sample.txt", mode="w") as remote_out:
    remote_out.write("Hello, World!")

# Read an existing CSV file from the bucket and print it
with fs.open("my-bucket-2/data/data.csv", mode="r") as csv_in:
    print(csv_in.read())


# Read back the file written above, printed after two blank lines
with fs.open("my-bucket-2/data/fsspec_sample.txt", mode="r") as sample_in:
    sample_text = sample_in.read()
    print("\n\n" + sample_text)
Name,Age,City
Alice,25,New York
Bob,30,London
Charlie,35,Paris



Hello, World!

Easily switch between local and remote filesystem#

# The same fsspec.open() call serves both stores; only the URL protocol
# and the extra storage options differ.

# Local file
print("Read local file")
with fsspec.open("file://data/fsspec_sample.txt", mode="r") as local_file:
    print(local_file.read())

# Remote file: credentials and endpoint are passed as keyword arguments
print("Read remote file")
minio_options = {
    "key": 'minioadmin',
    "secret": 'minioadmin',
    "client_kwargs": {'endpoint_url': 'http://localhost:9000'},
}
with fsspec.open(
    "s3://my-bucket-2/data/fsspec_sample.txt",
    mode="r",
    **minio_options,
) as remote_file:
    print(remote_file.read())
Read local file
Hello, World!

Read remote file
Hello, World!

Open zarr file with xarray using fsspec#

import xarray as xr

# Credentials and endpoint for the store
s3_options = {
    "key": 'minioadmin',
    "secret": 'minioadmin',
    "client_kwargs": {'endpoint_url': 'http://localhost:9000'},
}

# Map the remote zarr directory to a store object that xarray can read from
store = fsspec.get_mapper("s3://my-bucket-2/data/zarr_data", **s3_options)

# xarray needs its dimension metadata (`_ARRAY_DIMENSIONS`) in the store;
# the traceback below shows what happens when it is missing.
dataset = xr.open_zarr(store)
dataset
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr)
    214         # Xarray-Zarr
--> 215         dimensions = zarr_obj.attrs[dimension_key]
    216     except KeyError as e:

~/anaconda3/lib/python3.9/site-packages/zarr/attrs.py in __getitem__(self, item)
     73     def __getitem__(self, item):
---> 74         return self.asdict()[item]
     75 

KeyError: '_ARRAY_DIMENSIONS'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr)
    228             dimensions = [
--> 229                 os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]
    230             ]

KeyError: '_NCZARR_ARRAY'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
/tmp/ipykernel_1945216/2753404152.py in <module>
----> 1 dataset = xr.open_zarr(store)
      2 dataset

~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in open_zarr(store, group, synchronizer, chunks, decode_cf, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, consolidated, overwrite_encoded_chunks, chunk_store, storage_options, decode_timedelta, use_cftime, zarr_version, chunked_array_type, from_array_kwargs, **kwargs)
   1101     }
   1102 
-> 1103     ds = open_dataset(
   1104         filename_or_obj=store,
   1105         group=group,

~/anaconda3/lib/python3.9/site-packages/xarray/backends/api.py in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, chunked_array_type, from_array_kwargs, backend_kwargs, **kwargs)
    586 
    587     overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 588     backend_ds = backend.open_dataset(
    589         filename_or_obj,
    590         drop_variables=drop_variables,

~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, synchronizer, consolidated, chunk_store, storage_options, stacklevel, zarr_version, store, engine)
   1186         store_entrypoint = StoreBackendEntrypoint()
   1187         with close_on_error(store):
-> 1188             ds = store_entrypoint.open_dataset(
   1189                 store,
   1190                 mask_and_scale=mask_and_scale,

~/anaconda3/lib/python3.9/site-packages/xarray/backends/store.py in open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta)
     41         assert isinstance(filename_or_obj, AbstractDataStore)
     42 
---> 43         vars, attrs = filename_or_obj.load()
     44         encoding = filename_or_obj.get_encoding()
     45 

~/anaconda3/lib/python3.9/site-packages/xarray/backends/common.py in load(self)
    219         """
    220         variables = FrozenDict(
--> 221             (_decode_variable_name(k), v) for k, v in self.get_variables().items()
    222         )
    223         attributes = FrozenDict(self.get_attrs())

~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in get_variables(self)
    561 
    562     def get_variables(self):
--> 563         return FrozenDict(
    564             (k, self.open_store_variable(k, v)) for k, v in self.zarr_group.arrays()
    565         )

~/anaconda3/lib/python3.9/site-packages/xarray/core/utils.py in FrozenDict(*args, **kwargs)
    434 
    435 def FrozenDict(*args, **kwargs) -> Frozen:
--> 436     return Frozen(dict(*args, **kwargs))
    437 
    438 

~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in <genexpr>(.0)
    562     def get_variables(self):
    563         return FrozenDict(
--> 564             (k, self.open_store_variable(k, v)) for k, v in self.zarr_group.arrays()
    565         )
    566 

~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in open_store_variable(self, name, zarr_array)
    538         data = indexing.LazilyIndexedArray(ZarrArrayWrapper(zarr_array))
    539         try_nczarr = self._mode == "r"
--> 540         dimensions, attributes = _get_zarr_dims_and_attrs(
    541             zarr_array, DIMENSION_KEY, try_nczarr
    542         )

~/anaconda3/lib/python3.9/site-packages/xarray/backends/zarr.py in _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr)
    230             ]
    231         except KeyError as e:
--> 232             raise KeyError(
    233                 f"Zarr object is missing the attribute `{dimension_key}` and the NCZarr metadata, "
    234                 "which are required for xarray to determine variable dimensions."

KeyError: 'Zarr object is missing the attribute `_ARRAY_DIMENSIONS` and the NCZarr metadata, which are required for xarray to determine variable dimensions.'

I just realised that zarr_data was created using the zarr library directly, so xarray can't open it because the metadata is not compatible. Anyway, this is the code to open a zarr file that was saved by xarray, using fsspec.