# bpo-33899: Make tokenize module mirror end-of-file is end-of-line behavior (#7891)
**Lib/test/test_tokenize.py**

```diff
@@ -1,7 +1,8 @@
 from test import support
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer, generate_tokens)
+                     open as tokenize_open, Untokenizer, generate_tokens,
+                     NEWLINE)
 from io import BytesIO, StringIO
 import unittest
 from unittest import TestCase, mock
@@ -15,23 +16,34 @@ class TokenizeTest(TestCase):
     # Tests for the tokenize module.

     # The tests can be really simple. Given a small fragment of source
-    # code, print out a table with tokens.  The ENDMARKER is omitted for
-    # brevity.
+    # code, print out a table with tokens.  The ENDMARKER, ENCODING and
+    # final NEWLINE are omitted for brevity.

     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
+        # The ENDMARKER and final NEWLINE are omitted.
         result = []
         f = BytesIO(s.encode('utf-8'))
+        num_lines = len(s.splitlines())
         for type, token, start, end, line in tokenize(f.readline):
             if type == ENDMARKER:
                 break
+            if s[-1] not in '\r\n' and type == NEWLINE and end[0] == num_lines:
+                continue
             type = tok_name[type]
             result.append(f"    {type:10} {token!r:13} {start} {end}")
         self.assertEqual(result,
                          ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())

+    def test_implicit_newline(self):
+        # Make sure that the tokenizer puts in an implicit NEWLINE
+        # when the input lacks a trailing new line.
+        f = BytesIO("x".encode('utf-8'))
+        tokens = list(tokenize(f.readline))
+        self.assertEqual(tokens[-2].type, NEWLINE)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
+
     def test_basic(self):
         self.check_tokenize("1 + 1", """\
     NUMBER     '1'           (1, 0) (1, 1)
@@ -922,12 +934,15 @@ async def bar(): pass
 class GenerateTokensTest(TokenizeTest):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
+        # The ENDMARKER and final NEWLINE are omitted.
         result = []
         f = StringIO(s)
+        num_lines = len(s.splitlines())
         for type, token, start, end, line in generate_tokens(f.readline):
             if type == ENDMARKER:
                 break
+            if s[-1] not in '\r\n' and type == NEWLINE and end[0] == num_lines:
+                continue
             type = tok_name[type]
             result.append(f"    {type:10} {token!r:13} {start} {end}")
         self.assertEqual(result, expected.rstrip().splitlines())
@@ -1022,8 +1037,8 @@ def readline():
             else:
                 return b''

-        # skip the initial encoding token and the end token
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        # skip the initial encoding token and the end tokens
+        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
@@ -1039,8 +1054,8 @@ def readline():
             else:
                 return b''

-        # skip the end token
-        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        # skip the end tokens
+        tokens = list(_tokenize(readline, encoding=None))[:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "string not tokenized when encoding is None")
@@ -1351,18 +1366,21 @@ def test_oneline_defs(self):

         # Test that 500 consequent, one-line defs is OK
         toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
-        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
+        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
+                                                # [-2] is always NEWLINE

     def assertExactTypeEqual(self, opstr, *optypes):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
-        self.assertEqual(len(tokens), 2 + num_optypes)
+        self.assertEqual(len(tokens), 3 + num_optypes)
         self.assertEqual(tok_name[tokens[0].exact_type],
                          tok_name[ENCODING])
         for i in range(num_optypes):
             self.assertEqual(tok_name[tokens[i + 1].exact_type],
                              tok_name[optypes[i]])
-        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
-                         tok_name[token.ENDMARKER])
+        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.NEWLINE])
+        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
+                         tok_name[token.ENDMARKER])

     def test_exact_type(self):
@@ -1515,7 +1533,7 @@ def test_roundtrip(self):
         self.check_roundtrip("if x == 1:\n"
                              "    print(x)\n")
         self.check_roundtrip("# This is a comment\n"
-                             "# This also")
+                             "# This also\n")

         # Some people use different formatting conventions, which makes
         # untokenize a little trickier.  Note that this test involves trailing
```
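To make the new contract concrete, here is a quick sketch (mine, not part of the PR) of the token stream these tests now expect for input that lacks a trailing newline:

```python
from io import BytesIO
from tokenize import tokenize, tok_name

# "1 + 1" deliberately has no trailing newline.
for tok in tokenize(BytesIO(b"1 + 1").readline):
    print(tok_name[tok.type], repr(tok.string), tok.start, tok.end)

# With this change the stream ends with an implicit, empty NEWLINE:
#   ENCODING  'utf-8' (0, 0) (0, 0)
#   NUMBER    '1'     (1, 0) (1, 1)
#   OP        '+'     (1, 2) (1, 3)
#   NUMBER    '1'     (1, 4) (1, 5)
#   NEWLINE   ''      (1, 5) (1, 6)   <- newly emitted
#   ENDMARKER ''      (2, 0) (2, 0)
```

This is also why `assertExactTypeEqual` now expects `3 + num_optypes` tokens: ENCODING up front, then the operators, then the implicit NEWLINE and the ENDMARKER.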
**Lib/tokenize.py**

```diff
@@ -492,8 +492,15 @@ def _tokenize(readline, encoding):
             # BOM will already have been stripped.
             encoding = "utf-8"
         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
+    last_line = b''
+    line = b''
     while True:                                # loop over lines in stream
         try:
+            # We capture the value of the line variable here because
+            # readline uses the empty string '' to signal end of input,
+            # hence `line` itself will always be overwritten at the end
+            # of this loop.
+            last_line = line
             line = readline()
         except StopIteration:
             line = b''
@@ -648,6 +655,9 @@ def _tokenize(readline, encoding):
                                    (lnum, pos), (lnum, pos+1), line)
                 pos += 1

+    # Add an implicit NEWLINE if the input doesn't end in one
+    if len(last_line) > 0 and last_line[-1] not in '\r\n':
+        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
     for indent in indents[1:]:                 # pop remaining indent levels
         yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
     yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
```

Review discussion on the `last_line = line` capture:

- **Contributor:** Shouldn't […]
- **Author:** How you're describing it is what I had initially but there's a few places in the loop where iteration can stop so I thought this would be simpler.
- **Contributor:** Then perhaps add a comment to that end? It seems rather crucial to understanding that particular piece of the code.
- **Author:** Good point, added.
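Reduced to a standalone sketch (hypothetical helper name; the real loop has several exit points, which is why the capture sits at the top of the loop rather than at each exit), the pattern under discussion looks like this:

```python
from io import BytesIO

def last_line_of(readline):
    # Hypothetical standalone rendition of the capture pattern above:
    # `line` is overwritten on every iteration and readline() returns
    # b'' at end of input, so the previous value must be saved first
    # if it is still needed once the loop exits.
    last_line = b''
    line = b''
    while True:
        last_line = line          # capture before overwriting
        try:
            line = readline()
        except StopIteration:
            line = b''
        if not line:
            return last_line

print(last_line_of(BytesIO(b"x = 1").readline))  # b'x = 1'
```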
**Misc/NEWS entry** (new file):

```diff
@@ -0,0 +1,3 @@
+Tokenize module now implicitly emits a NEWLINE when provided with input that
+does not have a trailing new line.  This behavior now matches what the C
+tokenizer does internally.
```
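A small demonstration of that parity (my example, not from the PR): the compiler's C tokenizer has always tolerated a missing trailing newline, and after this change the pure-Python tokenizer reports the matching token structure.

```python
import ast
import io
import tokenize

# The C tokenizer already inserts the missing NEWLINE internally,
# which is why source without a trailing newline has always compiled:
ast.parse("x = 1")

# After this change the Python implementation agrees: the stream for
# "x = 1" (no trailing newline) ends with NEWLINE, then ENDMARKER.
toks = list(tokenize.generate_tokens(io.StringIO("x = 1").readline))
assert toks[-2].type == tokenize.NEWLINE
assert toks[-1].type == tokenize.ENDMARKER
```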
**Reviewer comment:** This check is quite unreadable. Not simple to make more readable though. A short comment could help.
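Assuming the comment refers to the skip condition in `check_tokenize`, one possible commented rendering, using hypothetical intermediate names (the PR itself keeps the single-line form):

```python
from tokenize import NEWLINE

def is_implicit_trailing_newline(s, tok_type, end_row, num_lines):
    # Hypothetical helper expressing the condition from check_tokenize:
    # the token is the NEWLINE that tokenize now appends on the last
    # line of source that does not itself end with a newline character.
    source_lacks_final_newline = s[-1] not in '\r\n'
    token_is_on_last_line = end_row == num_lines
    return (source_lacks_final_newline
            and tok_type == NEWLINE
            and token_is_on_last_line)
```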