Skip to content

Commit 447c27d

Browse files
Extend JSON serialization capabilities (#1612)
--------- Co-authored-by: Martin Durant <[email protected]>
1 parent bd9fdd5 commit 447c27d

File tree

6 files changed

+252
-62
lines changed

6 files changed

+252
-62
lines changed

fsspec/implementations/cached.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,7 @@ def __getattribute__(self, item):
434434
"__hash__",
435435
"__eq__",
436436
"to_json",
437+
"to_dict",
437438
"cache_size",
438439
"pipe_file",
439440
"pipe",
@@ -506,15 +507,6 @@ def __hash__(self):
506507
^ hash(self.target_protocol)
507508
)
508509

509-
def to_json(self):
510-
"""Calculate JSON representation.
511-
512-
Not implemented yet for CachingFileSystem.
513-
"""
514-
raise NotImplementedError(
515-
"CachingFileSystem JSON representation not implemented"
516-
)
517-
518510

519511
class WholeFileCacheFileSystem(CachingFileSystem):
520512
"""Caches whole remote files on first access

fsspec/implementations/tests/test_cached.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -873,7 +873,7 @@ def test_filecache_with_checks():
873873

874874
@pytest.mark.parametrize("impl", ["filecache", "simplecache", "blockcache"])
875875
@pytest.mark.parametrize("fs", ["local", "multi"], indirect=["fs"])
876-
def test_takes_fs_instance(impl, fs):
876+
def test_filecache_takes_fs_instance(impl, fs):
877877
origin = tempfile.mkdtemp()
878878
data = b"test data"
879879
f1 = os.path.join(origin, "afile")
@@ -885,6 +885,15 @@ def test_takes_fs_instance(impl, fs):
885885
assert fs2.cat(f1) == data
886886

887887

888+
@pytest.mark.parametrize("impl", ["filecache", "simplecache", "blockcache"])
889+
@pytest.mark.parametrize("fs", ["local", "multi"], indirect=["fs"])
890+
def test_filecache_serialization(impl, fs):
891+
fs1 = fsspec.filesystem(impl, fs=fs)
892+
json1 = fs1.to_json()
893+
894+
assert fs1 is fsspec.AbstractFileSystem.from_json(json1)
895+
896+
888897
def test_add_file_to_cache_after_save(local_filecache):
889898
(data, original_file, cache_location, fs) = local_filecache
890899

fsspec/json.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import json
2+
from contextlib import suppress
3+
from pathlib import PurePath
4+
from typing import Any, Callable, Dict, List, Optional, Tuple
5+
6+
from .registry import _import_class, get_filesystem_class
7+
from .spec import AbstractFileSystem
8+
9+
10+
class FilesystemJSONEncoder(json.JSONEncoder):
11+
def default(self, o: Any) -> Any:
12+
if isinstance(o, AbstractFileSystem):
13+
return o.to_dict()
14+
if isinstance(o, PurePath):
15+
cls = type(o)
16+
return {"cls": f"{cls.__module__}.{cls.__name__}", "str": str(o)}
17+
18+
return super().default(o)
19+
20+
21+
class FilesystemJSONDecoder(json.JSONDecoder):
22+
def __init__(
23+
self,
24+
*,
25+
object_hook: Optional[Callable[[Dict[str, Any]], Any]] = None,
26+
parse_float: Optional[Callable[[str], Any]] = None,
27+
parse_int: Optional[Callable[[str], Any]] = None,
28+
parse_constant: Optional[Callable[[str], Any]] = None,
29+
strict: bool = True,
30+
object_pairs_hook: Optional[Callable[[List[Tuple[str, Any]]], Any]] = None,
31+
) -> None:
32+
self.original_object_hook = object_hook
33+
34+
super().__init__(
35+
object_hook=self.custom_object_hook,
36+
parse_float=parse_float,
37+
parse_int=parse_int,
38+
parse_constant=parse_constant,
39+
strict=strict,
40+
object_pairs_hook=object_pairs_hook,
41+
)
42+
43+
@classmethod
44+
def try_resolve_path_cls(cls, dct: Dict[str, Any]):
45+
with suppress(Exception):
46+
fqp = dct["cls"]
47+
48+
path_cls = _import_class(fqp)
49+
50+
if issubclass(path_cls, PurePath):
51+
return path_cls
52+
53+
return None
54+
55+
@classmethod
56+
def try_resolve_fs_cls(cls, dct: Dict[str, Any]):
57+
with suppress(Exception):
58+
if "cls" in dct:
59+
try:
60+
fs_cls = _import_class(dct["cls"])
61+
if issubclass(fs_cls, AbstractFileSystem):
62+
return fs_cls
63+
except Exception:
64+
if "protocol" in dct: # Fallback if cls cannot be imported
65+
return get_filesystem_class(dct["protocol"])
66+
67+
raise
68+
69+
return None
70+
71+
def custom_object_hook(self, dct: Dict[str, Any]):
72+
if "cls" in dct:
73+
if (obj_cls := self.try_resolve_fs_cls(dct)) is not None:
74+
return AbstractFileSystem.from_dict(dct)
75+
if (obj_cls := self.try_resolve_path_cls(dct)) is not None:
76+
return obj_cls(dct["str"])
77+
78+
if self.original_object_hook is not None:
79+
return self.original_object_hook(dct)
80+
81+
return dct

fsspec/registry.py

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -257,27 +257,33 @@ def get_filesystem_class(protocol):
257257
"""
258258

259259

260-
def _import_class(cls, minv=None):
261-
"""Take a string FQP and return the imported class or identifier
260+
def _import_class(fqp: str):
261+
"""Take a fully-qualified path and return the imported class or identifier.
262262
263-
cls is of the form "package.module.klass" or "package.module:subobject.klass"
263+
``fqp`` is of the form "package.module.klass" or
264+
"package.module:subobject.klass".
265+
266+
Warnings
267+
--------
268+
This can import arbitrary modules. Make sure you haven't installed any modules
269+
that may execute malicious code at import time.
264270
"""
265-
if ":" in cls:
266-
mod, name = cls.rsplit(":", 1)
267-
s3 = mod == "s3fs"
268-
mod = importlib.import_module(mod)
269-
if s3 and mod.__version__.split(".") < ["0", "5"]:
270-
warnings.warn(s3_msg)
271-
for part in name.split("."):
272-
mod = getattr(mod, part)
273-
return mod
271+
if ":" in fqp:
272+
mod, name = fqp.rsplit(":", 1)
274273
else:
275-
mod, name = cls.rsplit(".", 1)
276-
s3 = mod == "s3fs"
277-
mod = importlib.import_module(mod)
278-
if s3 and mod.__version__.split(".") < ["0", "5"]:
279-
warnings.warn(s3_msg)
280-
return getattr(mod, name)
274+
mod, name = fqp.rsplit(".", 1)
275+
276+
is_s3 = mod == "s3fs"
277+
mod = importlib.import_module(mod)
278+
if is_s3 and mod.__version__.split(".") < ["0", "5"]:
279+
warnings.warn(s3_msg)
280+
for part in name.split("."):
281+
mod = getattr(mod, part)
282+
283+
if not isinstance(mod, type):
284+
raise TypeError(f"{fqp} is not a class")
285+
286+
return mod
281287

282288

283289
def filesystem(protocol, **storage_options):

fsspec/spec.py

Lines changed: 76 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import io
4+
import json
45
import logging
56
import os
67
import threading
@@ -9,7 +10,7 @@
910
from errno import ESPIPE
1011
from glob import has_magic
1112
from hashlib import sha256
12-
from typing import ClassVar
13+
from typing import Any, ClassVar, Dict, Tuple
1314

1415
from .callbacks import DEFAULT_CALLBACK
1516
from .config import apply_config, conf
@@ -115,6 +116,10 @@ class AbstractFileSystem(metaclass=_Cached):
115116
#: Extra *class attributes* that should be considered when hashing.
116117
_extra_tokenize_attributes = ()
117118

119+
# Set by _Cached metaclass
120+
storage_args: Tuple[Any, ...]
121+
storage_options: Dict[str, Any]
122+
118123
def __init__(self, *args, **storage_options):
119124
"""Create and configure file-system instance
120125
@@ -1381,61 +1386,98 @@ def read_block(self, fn, offset, length, delimiter=None):
13811386
length = size - offset
13821387
return read_block(f, offset, length, delimiter)
13831388

1384-
def to_json(self):
1389+
def to_json(self) -> str:
13851390
"""
1386-
JSON representation of this filesystem instance
1391+
JSON representation of this filesystem instance.
13871392
13881393
Returns
13891394
-------
1390-
str: JSON structure with keys cls (the python location of this class),
1391-
protocol (text name of this class's protocol, first one in case of
1392-
multiple), args (positional args, usually empty), and all other
1393-
kwargs as their own keys.
1395+
JSON string with keys ``cls`` (the python location of this class),
1396+
protocol (text name of this class's protocol, first one in case of
1397+
multiple), ``args`` (positional args, usually empty), and all other
1398+
keyword arguments as their own keys.
1399+
"""
1400+
from .json import FilesystemJSONEncoder
1401+
1402+
return json.dumps(self, cls=FilesystemJSONEncoder)
1403+
1404+
@staticmethod
1405+
def from_json(blob: str) -> AbstractFileSystem:
13941406
"""
1395-
import json
1407+
Recreate a filesystem instance from JSON representation.
1408+
1409+
See ``.to_json()`` for the expected structure of the input.
1410+
1411+
Parameters
1412+
----------
1413+
blob: str
1414+
1415+
Returns
1416+
-------
1417+
file system instance, not necessarily of this particular class.
13961418
1419+
Warnings
1420+
--------
1421+
This can import arbitrary modules (as determined by the ``cls`` key).
1422+
Make sure you haven't installed any modules that may execute malicious code
1423+
at import time.
1424+
"""
1425+
from .json import FilesystemJSONDecoder
1426+
1427+
return json.loads(blob, cls=FilesystemJSONDecoder)
1428+
1429+
def to_dict(self) -> Dict[str, Any]:
1430+
"""
1431+
JSON-serializable dictionary representation of this filesystem instance.
1432+
1433+
Returns
1434+
-------
1435+
Dictionary with keys ``cls`` (the python location of this class),
1436+
protocol (text name of this class's protocol, first one in case of
1437+
multiple), ``args`` (positional args, usually empty), and all other
1438+
keyword arguments as their own keys.
1439+
"""
13971440
cls = type(self)
1398-
cls = ".".join((cls.__module__, cls.__name__))
1399-
proto = (
1400-
self.protocol[0]
1401-
if isinstance(self.protocol, (tuple, list))
1402-
else self.protocol
1403-
)
1404-
return json.dumps(
1405-
dict(
1406-
cls=cls,
1407-
protocol=proto,
1408-
args=self.storage_args,
1409-
**self.storage_options,
1410-
)
1441+
proto = self.protocol
1442+
1443+
return dict(
1444+
cls=f"{cls.__module__}:{cls.__name__}",
1445+
protocol=proto[0] if isinstance(proto, (tuple, list)) else proto,
1446+
args=self.storage_args,
1447+
**self.storage_options,
14111448
)
14121449

14131450
@staticmethod
1414-
def from_json(blob):
1451+
def from_dict(dct: Dict[str, Any]) -> AbstractFileSystem:
14151452
"""
1416-
Recreate a filesystem instance from JSON representation
1453+
Recreate a filesystem instance from dictionary representation.
14171454
1418-
See ``.to_json()`` for the expected structure of the input
1455+
See ``.to_dict()`` for the expected structure of the input.
14191456
14201457
Parameters
14211458
----------
1422-
blob: str
1459+
dct: Dict[str, Any]
14231460
14241461
Returns
14251462
-------
14261463
file system instance, not necessarily of this particular class.
1464+
1465+
Warnings
1466+
--------
1467+
This can import arbitrary modules (as determined by the ``cls`` key).
1468+
Make sure you haven't installed any modules that may execute malicious code
1469+
at import time.
14271470
"""
1428-
import json
1471+
from .json import FilesystemJSONDecoder
14291472

1430-
from .registry import _import_class, get_filesystem_class
1473+
cls = FilesystemJSONDecoder.try_resolve_fs_cls(dct)
1474+
if cls is None:
1475+
raise ValueError("Not a serialized AbstractFileSystem")
14311476

1432-
dic = json.loads(blob)
1433-
protocol = dic.pop("protocol")
1434-
try:
1435-
cls = _import_class(dic.pop("cls"))
1436-
except (ImportError, ValueError, RuntimeError, KeyError):
1437-
cls = get_filesystem_class(protocol)
1438-
return cls(*dic.pop("args", ()), **dic)
1477+
dct.pop("cls", None)
1478+
dct.pop("protocol", None)
1479+
1480+
return cls(*dct.pop("args", ()), **dct)
14391481

14401482
def _get_pyarrow_filesystem(self):
14411483
"""

0 commit comments

Comments
 (0)