
Commit 5342d2e

Replace the blib2to3 tokenizer with pytokens (#4536)
1 parent 9f38928 commit 5342d2e

File tree

9 files changed, +183 -1088 lines changed


.pre-commit-config.yaml (+1)

@@ -51,6 +51,7 @@ repos:
 - click >= 8.1.0, != 8.1.4, != 8.1.5
 - packaging >= 22.0
 - platformdirs >= 2.1.0
+- pytokens >= 0.1.10
 - pytest
 - hypothesis
 - aiohttp >= 3.7.4

CHANGES.md (+1)

@@ -29,6 +29,7 @@
 
 <!-- Changes to the parser or to version autodetection -->
 
+- Rewrite tokenizer to improve performance and compliance (#4536)
 - Fix bug where certain unusual expressions (e.g., lambdas) were not accepted
   in type parameter bounds and defaults. (#4602)

pyproject.toml (+1)

@@ -69,6 +69,7 @@ dependencies = [
 "packaging>=22.0",
 "pathspec>=0.9.0",
 "platformdirs>=2",
+"pytokens>=0.1.10",
 "tomli>=1.1.0; python_version < '3.11'",
 "typing_extensions>=4.0.1; python_version < '3.11'",
 ]
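
pytokens thereby becomes a regular runtime dependency of Black, mirrored in the pre-commit hook configuration above. A minimal sketch (not part of the commit) of verifying at runtime that the installed pytokens satisfies the declared lower bound, using only the standard library and the already-required packaging dependency:

# Sketch only: check that the new runtime dependency resolves and meets the
# ">=0.1.10" constraint declared in pyproject.toml.
from importlib.metadata import version

from packaging.version import Version

assert Version(version("pytokens")) >= Version("0.1.10")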

src/blib2to3/pgen2/driver.py (+5 -15)

@@ -28,7 +28,7 @@
 from typing import IO, Any, Optional, Union, cast
 
 from blib2to3.pgen2.grammar import Grammar
-from blib2to3.pgen2.tokenize import GoodTokenInfo
+from blib2to3.pgen2.tokenize import TokenInfo
 from blib2to3.pytree import NL
 
 # Pgen imports
@@ -112,7 +112,7 @@ def __init__(self, grammar: Grammar, logger: Optional[Logger] = None) -> None:
             logger = logging.getLogger(__name__)
         self.logger = logger
 
-    def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) -> NL:
+    def parse_tokens(self, tokens: Iterable[TokenInfo], debug: bool = False) -> NL:
         """Parse a series of tokens and return the syntax tree."""
         # XXX Move the prefix computation into a wrapper around tokenize.
         proxy = TokenProxy(tokens)
@@ -180,27 +180,17 @@ def parse_tokens(self, tokens: Iterable[GoodTokenInfo], debug: bool = False) ->
         assert p.rootnode is not None
         return p.rootnode
 
-    def parse_stream_raw(self, stream: IO[str], debug: bool = False) -> NL:
-        """Parse a stream and return the syntax tree."""
-        tokens = tokenize.generate_tokens(stream.readline, grammar=self.grammar)
-        return self.parse_tokens(tokens, debug)
-
-    def parse_stream(self, stream: IO[str], debug: bool = False) -> NL:
-        """Parse a stream and return the syntax tree."""
-        return self.parse_stream_raw(stream, debug)
-
     def parse_file(
         self, filename: Path, encoding: Optional[str] = None, debug: bool = False
     ) -> NL:
         """Parse a file and return the syntax tree."""
         with open(filename, encoding=encoding) as stream:
-            return self.parse_stream(stream, debug)
+            text = stream.read()
+            return self.parse_string(text, debug)
 
     def parse_string(self, text: str, debug: bool = False) -> NL:
         """Parse a string and return the syntax tree."""
-        tokens = tokenize.generate_tokens(
-            io.StringIO(text).readline, grammar=self.grammar
-        )
+        tokens = tokenize.tokenize(text, grammar=self.grammar)
         return self.parse_tokens(tokens, debug)
 
     def _partially_consume_prefix(self, prefix: str, column: int) -> tuple[str, str]:
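
With this change the stream-based entry points (parse_stream_raw, parse_stream) are removed: parse_file reads the whole source into memory, and everything funnels through parse_string, which hands the text to the new tokenize.tokenize wrapper in a single pass instead of tokenizing line by line through readline. A minimal sketch of how a caller adapts; the grammar choice (pygram.python_grammar) is an assumption for illustration, not part of this diff:

# Sketch only: parsing a file through the string-based Driver API shown above.
from blib2to3 import pygram
from blib2to3.pgen2 import driver

d = driver.Driver(pygram.python_grammar)

# Before this commit: d.parse_stream(stream) tokenized via stream.readline.
# After it: read the source up front and let parse_string tokenize it whole.
with open("example.py", encoding="utf-8") as f:
    tree = d.parse_string(f.read())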

src/blib2to3/pgen2/pgen.py (+3 -4)

@@ -6,7 +6,7 @@
 from typing import IO, Any, NoReturn, Optional, Union
 
 from blib2to3.pgen2 import grammar, token, tokenize
-from blib2to3.pgen2.tokenize import GoodTokenInfo
+from blib2to3.pgen2.tokenize import TokenInfo
 
 Path = Union[str, "os.PathLike[str]"]
 
@@ -18,7 +18,7 @@ class PgenGrammar(grammar.Grammar):
 class ParserGenerator:
     filename: Path
     stream: IO[str]
-    generator: Iterator[GoodTokenInfo]
+    generator: Iterator[TokenInfo]
     first: dict[str, Optional[dict[str, int]]]
 
     def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
@@ -27,8 +27,7 @@ def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
             stream = open(filename, encoding="utf-8")
             close_stream = stream.close
         self.filename = filename
-        self.stream = stream
-        self.generator = tokenize.generate_tokens(stream.readline)
+        self.generator = tokenize.tokenize(stream.read())
         self.gettoken()  # Initialize lookahead
         self.dfas, self.startsymbol = self.parse()
         if close_stream is not None:
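
The pattern is the same as in driver.py: the readline-driven generate_tokens generator is replaced by a tokenize.tokenize call that takes the full source string. A minimal sketch of iterating the new wrapper's output; the five-field token layout (type, value, start, end, line) is carried over from the old GoodTokenInfo and is an assumption here:

# Sketch only: iterating tokens from the new string-based tokenize wrapper.
from blib2to3.pgen2 import token, tokenize

source = "x = 1\n"
for tok_type, value, start, end, line in tokenize.tokenize(source):
    # start and end are (row, column) pairs, as in the old tokenizer.
    print(token.tok_name[tok_type], repr(value), start, end)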

0 commit comments