Skip to content

Commit 1439f9b

Browse files
committed
feat(gist): fsspec file system for GitHub gists (resolves #888)
1 parent ac7031b commit 1439f9b

File tree

4 files changed

+232
-3
lines changed

4 files changed

+232
-3
lines changed

docs/source/api.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ Built-in Implementations
117117
fsspec.implementations.dbfs.DatabricksFileSystem
118118
fsspec.implementations.dirfs.DirFileSystem
119119
fsspec.implementations.ftp.FTPFileSystem
120+
fsspec.implementations.gist.GistFileSystem
120121
fsspec.implementations.git.GitFileSystem
121122
fsspec.implementations.github.GithubFileSystem
122123
fsspec.implementations.http.HTTPFileSystem
@@ -162,6 +163,9 @@ Built-in Implementations
162163
.. autoclass:: fsspec.implementations.ftp.FTPFileSystem
163164
:members: __init__
164165

166+
.. autoclass:: fsspec.implementations.gist.GistFileSystem
167+
:members: __init__
168+
165169
.. autoclass:: fsspec.implementations.git.GitFileSystem
166170
:members: __init__
167171

fsspec/implementations/gist.py

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
import requests
2+
3+
import fsspec
4+
5+
from ..spec import AbstractFileSystem
6+
from ..utils import infer_storage_options
7+
from .memory import MemoryFile
8+
9+
10+
class GistFileSystem(AbstractFileSystem):
    """
    Interface to files in a single GitHub Gist.

    Provides read-only access to a gist's files. Gists do not contain
    subdirectories, so file listing is straightforward.

    Parameters
    ----------
    gist_id : str
        The ID of the gist you want to access (the long hex value from the URL).
    sha : str (optional)
        If provided, fetch a particular revision of the gist. If omitted,
        the latest revision is used.
    username : str (optional)
        GitHub username for authentication. Requires ``token``; when both are
        given, requests use HTTP basic auth with ``(username, token)``.
    token : str (optional)
        GitHub personal access token. May be given without ``username``, in
        which case it is sent as an ``Authorization: Bearer`` header.
    timeout : (float, float) or float, optional
        Connect and read timeouts for requests (default 60s each).
    kwargs : dict
        Passed to AbstractFileSystem base class.
    """

    protocol = "gist"
    gist_url = "https://api.github.com/gists/{gist_id}"
    gist_rev_url = "https://api.github.com/gists/{gist_id}/{sha}"

    def __init__(
        self, gist_id, sha=None, username=None, token=None, timeout=None, **kwargs
    ):
        super().__init__(**kwargs)
        self.gist_id = gist_id
        self.sha = sha  # revision of the gist (optional)
        # A token on its own is sufficient to authenticate, and is exactly what
        # a ``gist://:TOKEN@<gist_id>/file`` URL produces. A username without a
        # token, however, cannot authenticate anything.
        if username and not token:
            raise ValueError("Auth requires a token when a username is given.")
        self.username = username
        self.token = token
        # Default timeouts to 60s connect/read if none provided
        self.timeout = timeout if timeout is not None else (60, 60)

        # We use a single-level "directory" cache, because a gist is essentially flat
        self.dircache[""] = self._fetch_file_list()

    @property
    def kw(self):
        """
        Auth parameters passed to 'requests'.

        Returns an empty dict when no token is set, basic auth when both
        username and token are set, and a bearer-token header when only the
        token is set.
        """
        if not self.token:
            return {}
        if self.username:
            return {"auth": (self.username, self.token)}
        return {"headers": {"Authorization": f"Bearer {self.token}"}}

    def _fetch_gist_metadata(self):
        """
        Fetch the JSON metadata for this gist (possibly for a specific revision).

        Raises
        ------
        FileNotFoundError
            If the API answers 404 for the gist (or the requested revision).
        requests.HTTPError
            For any other unsuccessful HTTP status.
        """
        if self.sha:
            url = self.gist_rev_url.format(gist_id=self.gist_id, sha=self.sha)
        else:
            url = self.gist_url.format(gist_id=self.gist_id)

        r = requests.get(url, timeout=self.timeout, **self.kw)
        if r.status_code == 404:
            raise FileNotFoundError(
                f"Gist not found: {self.gist_id}@{self.sha or 'latest'}"
            )
        r.raise_for_status()
        return r.json()

    def _fetch_file_list(self):
        """
        Returns a list of dicts describing each file in the gist. These get stored
        in self.dircache[""].

        Each entry has the keys ``name``, ``type`` (always ``"file"``), ``size``
        and ``raw_url``.
        """
        meta = self._fetch_gist_metadata()
        # Tolerate both a missing "files" key and an explicit null value.
        files = meta.get("files") or {}
        return [
            {
                "name": fname,  # file's name
                "type": "file",  # gists have no subdirectories
                "size": finfo.get("size", 0),  # file size in bytes
                "raw_url": finfo.get("raw_url"),
            }
            for fname, finfo in files.items()
            # Entries can be null (e.g. for a deleted file); skip those.
            if finfo is not None
        ]

    @classmethod
    def _strip_protocol(cls, path):
        """
        Remove 'gist://' (and any credentials / gist ID) from the path, if present.

        The result is the bare file name with no leading slash; an empty string
        denotes the gist itself.
        """
        # infer_storage_options handles gist://username:token@id/file as well as
        # gist://id/file; anything without a usable "path" is passed through.
        path = infer_storage_options(path).get("path", path)
        return path.lstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(path):
        """
        Parse 'gist://' style URLs into GistFileSystem constructor kwargs.
        For example:
          gist://:TOKEN@<gist_id>/file.txt
          gist://username:TOKEN@<gist_id>/file.txt

        Only components that are present and non-empty are returned, so the
        result may contain any subset of ``username``, ``token``, ``gist_id``.
        """
        so = infer_storage_options(path)
        out = {}
        if so.get("username"):
            out["username"] = so["username"]
        if so.get("password"):
            out["token"] = so["password"]
        if so.get("host"):
            # We interpret 'host' as the gist ID
            out["gist_id"] = so["host"]
        return out

    def ls(self, path="", detail=False, **kwargs):
        """
        List files in the gist. Gists are single-level, so any 'path' is basically
        the filename, or empty for all files.

        Parameters
        ----------
        path : str, optional
            The filename to list. If empty, returns all files in the gist.
        detail : bool, default False
            If True, return a list of dicts; if False, return a list of filenames.

        Raises
        ------
        FileNotFoundError
            If a non-empty ``path`` does not name a file in the gist.
        """
        path = self._strip_protocol(path or "")
        all_files = self.dircache[""]
        if path == "":
            # Empty path: the whole (flat) gist
            results = all_files
        else:
            # We want just the single file with this name
            results = [f for f in all_files if f["name"] == path]
            if not results:
                raise FileNotFoundError(path)
        if detail:
            return results
        return sorted(f["name"] for f in results)

    def _open(self, path, mode="rb", block_size=None, **kwargs):
        """
        Read a single file from the gist.

        The whole file is downloaded from its ``raw_url`` and returned as an
        in-memory file object.

        Raises
        ------
        NotImplementedError
            For any mode other than ``"rb"``.
        FileNotFoundError
            If the file is not in the gist, has no ``raw_url``, or the download
            answers 404.
        """
        if mode != "rb":
            raise NotImplementedError("GitHub Gist FS is read-only (no write).")

        path = self._strip_protocol(path)
        # Find the file entry in our dircache
        finfo = next((f for f in self.dircache[""] if f["name"] == path), None)
        if finfo is None:
            raise FileNotFoundError(path)

        raw_url = finfo.get("raw_url")
        if not raw_url:
            raise FileNotFoundError(f"No raw_url for file: {path}")

        r = requests.get(raw_url, timeout=self.timeout, **self.kw)
        if r.status_code == 404:
            raise FileNotFoundError(path)
        r.raise_for_status()
        # NOTE(review): MemoryFile is believed to take (fs, path, data)
        # positionally, so the path goes in the second slot - confirm against
        # fsspec.implementations.memory.
        return MemoryFile(None, path, r.content)

    def cat(self, path, recursive=False, on_error="raise", **kwargs):
        """
        Return {path: contents} for the given file or files. If 'recursive' is True,
        and path is empty, returns all files in the gist.

        ``on_error`` controls what happens when a file is missing: "raise"
        re-raises the FileNotFoundError, "omit" leaves the path out of the
        result, and any other value stores the exception as that path's value.

        NOTE(review): a dict is returned even for a single path; the generic
        fsspec ``cat`` is believed to return bare bytes in that case - confirm
        whether this override should match.
        """
        paths = self.expand_path(path, recursive=recursive)
        out = {}
        for p in paths:
            try:
                with self.open(p, "rb") as f:
                    out[p] = f.read()
            except FileNotFoundError as e:
                if on_error == "raise":
                    raise
                if on_error != "omit":
                    out[p] = e
        return out
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import pytest
2+
3+
import fsspec
4+
5+
6+
@pytest.mark.parametrize(
    "gist_id,sha",
    [("16bee4256595d3b6814be139ab1bd54e", "760905f9f222ad41b9c3fd8308cbbd016943c65a")],
)
def test_gist_public(gist_id, sha):
    """Smoke-test listing and reading every file of a pinned gist revision."""
    gist_fs = fsspec.filesystem("gist", gist_id=gist_id, sha=sha)

    # Listing: the pinned revision must expose at least one file.
    names = gist_fs.ls("")
    assert len(names) > 0

    # Cat: every listed file comes back, keyed by its name, as raw bytes.
    contents = gist_fs.cat(names)
    assert set(contents.keys()) == set(names)
    for payload in contents.values():
        assert isinstance(payload, bytes)

fsspec/registry.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,10 @@ def register_implementation(name, cls, clobber=False, errtxt=None):
122122
"err": "Please install gdrivefs for access to Google Drive",
123123
},
124124
"generic": {"class": "fsspec.generic.GenericFileSystem"},
125+
"gist": {
126+
"class": "fsspec.implementations.gist.GistFileSystem",
127+
"err": "Install the requests package to use the gist FS",
128+
},
125129
"git": {
126130
"class": "fsspec.implementations.git.GitFileSystem",
127131
"err": "Install pygit2 to browse local git repos",
@@ -225,9 +229,9 @@ def register_implementation(name, cls, clobber=False, errtxt=None):
225229
"zip": {"class": "fsspec.implementations.zip.ZipFileSystem"},
226230
}
227231

228-
assert list(known_implementations) == sorted(known_implementations), (
229-
"Not in alphabetical order"
230-
)
232+
assert list(known_implementations) == sorted(
233+
known_implementations
234+
), "Not in alphabetical order"
231235

232236

233237
def get_filesystem_class(protocol):

0 commit comments

Comments
 (0)